From 45a3008ceb0b9f55b23fdc3dc8d4f4be480b86aa Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 10 Nov 2025 08:02:17 +0000 Subject: [PATCH 001/578] feat: Integrate AITER bpreshuffle and ck operators on top of fp8 refactor Signed-off-by: vllmellm --- .../schemes/compressed_tensors_w8a8_fp8.py | 2 + .../kernels/scaled_mm/__init__.py | 4 + .../quantization/kernels/scaled_mm/aiter.py | 217 +++++++++++++++++- 3 files changed, 222 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 2cd29e0905d0..e25d2aaa439b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -192,6 +192,8 @@ def process_weights_after_loading(self, layer) -> None: if self.strategy == QuantizationStrategy.BLOCK: maybe_post_process_fp8_weight_block(layer, self.cutlass_block_fp8_supported) + self.fp8_linear.process_weights_after_loading(layer) + def apply_weights( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index b033cc7905e4..b8c7f78aac64 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -8,6 +8,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, AiterScaledMMLinearKernel, ) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( @@ -64,6 +66,8 @@ ChannelWiseTorchScaledMMLinearKernel, ], PlatformEnum.ROCM: [ + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, ROCmScaledMMLinearKernel, PerTensorTorchScaledMMLinearKernel, RowWiseTorchScaledMMLinearKernel, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 3ac90553bbc7..430e407156c5 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -2,15 +2,25 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + import torch +from aiter.ops.shuffle import shuffle_weight import vllm.envs as envs from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op from .cutlass import CutlassScaledMMLinearKernel -from .ScaledMMLinearKernel import Int8ScaledMMLinearLayerConfig +from .ScaledMMLinearKernel import ( + FP8ScaledMMLinearKernel, + FP8ScaledMMLinearLayerConfig, + Int8ScaledMMLinearLayerConfig, +) + +logger = init_logger(__name__) def rocm_aiter_gemm_w8a8_impl( @@ -52,6 +62,54 @@ def rocm_aiter_gemm_w8a8_fake( ) +# bpshuffle +def rocm_aiter_gemm_a8w8_bpreshuffle_impl( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + # This AITER function can be used for + # - per-token 
activations + per-channel weights + # e.g. vllm/model_executor/layers/quantization/utils/w8a8_utils.py + # accept the weight as # keep the weight as (N, K) + # NOTE: The weight has to be shuffled in the + # process_weights_after_loading of the CompressedTensorsW8A8Fp8 class + + from aiter import gemm_a8w8_bpreshuffle_ck + + m = input.shape[0] + n = weight.shape[0] + Y = torch.empty(m, n, dtype=out_dtype, device=input.device) + gemm_a8w8_bpreshuffle_ck(input, weight, scale_a, scale_b, Y) + return Y + + +def rocm_aiter_gemm_a8w8_bpreshuffle_fake( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + m = input.shape[0] + n = weight.shape[0] + if out_dtype is None: + out_dtype = input.dtype + return torch.empty((m, n), dtype=out_dtype, device=input.device) + + +if current_platform.is_rocm(): + direct_register_custom_op( + op_name="rocm_aiter_gemm_a8w8_bpreshuffle", + op_func=rocm_aiter_gemm_a8w8_bpreshuffle_impl, + mutates_args=[], + fake_impl=rocm_aiter_gemm_a8w8_bpreshuffle_fake, + dispatch_key=current_platform.dispatch_key, + ) + + class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): @classmethod def get_min_capability(cls) -> int: @@ -157,3 +215,160 @@ def apply_weights( return torch.ops.vllm.rocm_aiter_gemm_w8a8( x_q, w_q.t(), x_s, w_s, bias, out_dtype ) + + +# bpreshuffle +class AiterBpreshufflePerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + def get_ouput_padding(self) -> int | None: + # PTPC kernels do not require padding. + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER bpreshuffle is ROCm-only") + if not (envs.VLLM_ROCM_USE_AITER_LINEAR and envs.VLLM_ROCM_USE_AITER): + return (False, "AITER bpreshuffle is disabled by env var") + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + # Check if the configuration is PTPC + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterBpreshuffle: can_implement called. is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once("AiterBpreshuffle: SHUFFLING WEIGHTS NOW.") + + w_q, _, _, _ = self._get_layer_params(layer) + + N = w_q.shape[1] + K = w_q.shape[0] + + if N % 16 == 0 and K % 16 == 0: + # AITER shuffle_weight expectation [N, K] + w_q_nk = w_q.t().contiguous() + + # Execute shuffle + shuffled_w_nk = shuffle_weight(w_q_nk, layout=(16, 16)) + + del layer.weight + layer.register_buffer("weight", shuffled_w_nk) + + logger.info_once("[AiterBpreshuffle: Weight shuffle COMPLETE.") + + else: + raise ValueError( + f"Weight shape (N={N}, K={K}) not divisible by 16 " + "for AITER bpreshuffle." + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # 1. Obtain parameters + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + # 2. Dynamic quantization input + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterBpreshuffle: apply_weights... 
ABOUT TO CALL C++ KERNEL..." + ) + + # 3. Call the AITER bpreshuffle CK operator. + output = torch.ops.vllm.rocm_aiter_gemm_a8w8_bpreshuffle( + qinput, + w_q, # Input [N, K] shuffle weights + out_dtype=self.config.out_dtype, + scale_a=qinput_scale, + scale_b=w_s, + ) + + logger.info_once("AiterBpreshuffle: C++ KERNEL CALL SUCCEEDED.") + + if bias is not None: + output.add_(bias) + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_gemm_a8w8_bpreshuffle_impl + + +# AITER FP8 CK +class AiterCKPerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + """ + AITER PTPC kernel (gemm_a8w8_CK) without pre-shuffling. + """ + + def get_ouput_padding(self) -> int | None: + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER CK is ROCm-only") + if not (envs.VLLM_ROCM_USE_AITER_LINEAR and envs.VLLM_ROCM_USE_AITER): + return (False, "AITER CK is disabled by env var") + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterCK: can_implement called. is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once( + "AITER CK: process_weights_after_loading... DOING NOTHING (pass)." + ) + pass + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterCK: apply_weights... " + "ABOUT TO CALL C++ KERNEL (this is where it hangs)..." 
+ ) + + output = torch.ops.vllm.rocm_aiter_gemm_w8a8( + qinput, w_q.t(), qinput_scale, w_s, bias, self.config.out_dtype + ) + + logger.info_once("AiterCK: C++ KERNEL CALL SUCCEEDED.") + + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_gemm_w8a8_impl From fa183e92713456dec682088a362dd9908100cc03 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 13 Nov 2025 15:59:58 +0800 Subject: [PATCH 002/578] [Bugfix] fix kimi-linear crash (#28445) Signed-off-by: zjy0516 --- vllm/model_executor/layers/kda.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index 26458f2e3c4d..2e7500bac718 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -44,7 +44,6 @@ def kda_attention( k_proj_states: torch.Tensor, v_proj_states: torch.Tensor, g1: torch.Tensor, - g2: torch.Tensor, beta: torch.Tensor, core_attn_out: torch.Tensor, layer_name: str, @@ -56,7 +55,6 @@ def kda_attention( k_proj_states=k_proj_states, v_proj_states=v_proj_states, g1=g1, - g2=g2, beta=beta, core_attn_out=core_attn_out, ) @@ -67,7 +65,6 @@ def kda_attention_fake( k_proj_states: torch.Tensor, v_proj_states: torch.Tensor, g1: torch.Tensor, - g2: torch.Tensor, beta: torch.Tensor, core_attn_out: torch.Tensor, layer_name: str, @@ -284,7 +281,6 @@ def forward( k, v, g1, - g2, beta, core_attn_out, self.prefix, @@ -299,7 +295,6 @@ def _forward( k_proj_states: torch.Tensor, v_proj_states: torch.Tensor, g1: torch.Tensor, - g2: torch.Tensor, beta: torch.Tensor, core_attn_out: torch.Tensor, ) -> None: @@ -316,8 +311,15 @@ def _forward( has_initial_state = attn_metadata.has_initial_state non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 + num_actual_tokens = attn_metadata.num_actual_tokens constant_caches = self.kv_cache[forward_context.virtual_engine] + q_proj_states = q_proj_states[:num_actual_tokens] + k_proj_states = k_proj_states[:num_actual_tokens] + v_proj_states = v_proj_states[:num_actual_tokens] + g1 = g1[:num_actual_tokens] + beta = beta[:num_actual_tokens] + (conv_state_q, conv_state_k, conv_state_v, recurrent_state) = constant_caches # deal with strides conv_state_q = conv_state_q.transpose(-1, -2) @@ -372,7 +374,7 @@ def _forward( ).transpose(0, 1) else: decode_conv_indices = non_spec_state_indices_tensor[ - : attn_metadata.num_decodes + : attn_metadata.num_actual_tokens ] q = causal_conv1d_update( q_proj_states, @@ -438,8 +440,9 @@ def _forward( beta=beta, initial_state=recurrent_state, use_qk_l2norm_in_kernel=True, - cu_seqlens=non_spec_query_start_loc, + cu_seqlens=non_spec_query_start_loc[: attn_metadata.num_decodes + 1], ssm_state_indices=non_spec_state_indices_tensor, ) - assert core_attn_out_non_spec.shape == core_attn_out.shape - core_attn_out[:] = core_attn_out_non_spec + core_attn_out[0, :num_actual_tokens] = core_attn_out_non_spec[ + 0, :num_actual_tokens + ] From 5c9ad138d507320f6432cfc3d727980853fd5e91 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Thu, 13 Nov 2025 16:14:13 +0800 Subject: [PATCH 003/578] [Frontend] supports interleaved thinking (#28531) Signed-off-by: chaunceyjiang --- docs/features/interleaved_thinking.md | 118 ++++++++++++++++++ ...penai_chat_completion_client_with_tools.py | 1 + vllm/entrypoints/chat_utils.py | 17 ++- 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 
docs/features/interleaved_thinking.md diff --git a/docs/features/interleaved_thinking.md b/docs/features/interleaved_thinking.md new file mode 100644 index 000000000000..7343324b4849 --- /dev/null +++ b/docs/features/interleaved_thinking.md @@ -0,0 +1,118 @@ +# Interleaved Thinking + +## Introduction + +Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results. + +Important: Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature. + +## How Interleaved Thinking Works + +With interleaved thinking, the model can: + +- Reason about the results of a tool call before deciding what to do next +- Chain multiple tool calls with reasoning steps in between +- Make more nuanced decisions based on intermediate results +- Provide transparent reasoning for its tool selection process + +## Supported Models + +vLLM currently supports the following interleaved thinking models: + +| Model Series | Reasoning Parser Name | +|--------------|-----------------------| +| moonshotai/Kimi-K2-Thinking | kimi_k2 | +| MiniMaxAI/MiniMax-M2 | minimax_m2 | + +## Example Usage + +To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example: + +??? code + + ```python + """ + vllm serve MiniMaxAI/MiniMax-M2 \ + --tensor-parallel-size 4 \ + --tool-call-parser minimax_m2 \ + --reasoning-parser minimax_m2 \ + --enable-auto-tool-choice + """ + import json + + from openai import OpenAI + + client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") + + + def get_current_weather(location: str, unit: "str"): + """Get the current weather in a given location""" + if unit == "celsius": + return f"The current temperature in {location} is 22°C." + else: + return f"The current temperature in {location} is 72°F." 
+ + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and state, e.g., 'San Francisco, CA'", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location", "unit"], + }, + }, + } + ] + messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}] + response = client.chat.completions.create( + model=client.models.list().data[0].id, + messages=messages, + tools=tools, + tool_choice="auto", + ) + + tool_call = response.choices[0].message.tool_calls[0].function + + messages.append( + { + "role": "assistant", + "tool_calls": response.choices[0].message.tool_calls, + "reasoning": response.choices[0].message.reasoning, # append reasoning + } + ) + + # Simulate tool execution + available_tools = {"get_weather": get_current_weather} + + completion_tool_calls = response.choices[0].message.tool_calls + for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + messages.append( + { + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name, + } + ) + response_2 = client.chat.completions.create( + model=client.models.list().data[0].id, + messages=messages, + tools=tools, + tool_choice="auto", + ) + print(response_2.choices[0].message.content) + ``` +This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response. diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 41dbb3236297..0bd1d05322f8 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -161,6 +161,7 @@ def main(): { "role": "assistant", "tool_calls": chat_completion.choices[0].message.tool_calls, + "reasoning": chat_completion.choices[0].message.reasoning, } ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index d7d6419d643b..3b722c2d9277 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -240,6 +240,9 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" + reasoning: str | None + """The reasoning content for interleaved thinking.""" + ChatCompletionMessageParam: TypeAlias = ( OpenAIChatCompletionMessageParam @@ -265,6 +268,12 @@ class ConversationMessage(TypedDict, total=False): tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" + reasoning: str | None + """The reasoning content for interleaved thinking.""" + + reasoning_content: str | None + """Deprecated: The reasoning content for interleaved thinking.""" + # Passed in by user ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] @@ -1374,7 +1383,7 @@ def _parse_chat_message_content( ) -> list[ConversationMessage]: role = message["role"] content = message.get("content") - + reasoning = message.get("reasoning") or 
message.get("reasoning_content") if content is None: content = [] elif isinstance(content, str): @@ -1396,6 +1405,12 @@ def _parse_chat_message_content( # follow the OpenAI spec. if "tool_calls" in parsed_msg and parsed_msg["tool_calls"] is not None: result_msg["tool_calls"] = list(parsed_msg["tool_calls"]) + # Include reasoning if present for interleaved thinking. + if reasoning is not None: + result_msg["reasoning"] = cast(str, reasoning) + result_msg["reasoning_content"] = cast( + str, reasoning + ) # keep compatibility elif role == "tool": parsed_msg = _ToolParser(message) if "tool_call_id" in parsed_msg: From 11ac9ddd037c63a8c9404cd1f62f9f81a5f38652 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 12 Nov 2025 22:57:20 -1000 Subject: [PATCH 004/578] Support all interleaved layer types (#28485) Signed-off-by: Yong Hoon Shin --- vllm/transformers_utils/config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 14cae2b168e1..b7418cfb7cc7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -472,8 +472,7 @@ def is_interleaved(config: PretrainedConfig) -> bool: """ text_config = config.get_text_config() if layer_types := getattr(text_config, "layer_types", None): - interleaved_types = {"full_attention", "sliding_attention"} - return interleaved_types.issubset(layer_types) + return len(set(layer_types)) > 1 return False From e63fd445605b442a81a4eb2f402206cc337ab8dd Mon Sep 17 00:00:00 2001 From: Di Wu <95495325+dw2761@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:57:44 +0800 Subject: [PATCH 005/578] Fix: Correctly filter special tokens in benchmark_prefix_caching (#28615) Signed-off-by: Di Wu --- benchmarks/benchmark_prefix_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 146c268a6b7f..28fc383a318d 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: # Remove the special tokens. 
return random.choices( - [v for k, v in vocab.items() if k not in all_special_ids], + [v for v in vocab.values() if v not in all_special_ids], k=length, ) From 5e973209aaf5fa15459555eaa42bcd20ea63aa0d Mon Sep 17 00:00:00 2001 From: Zijing Liu Date: Thu, 13 Nov 2025 03:30:04 -0800 Subject: [PATCH 006/578] [BugFix] Fix type error when assign a trition kernel tensor to a torch.nn.Parameter (#28603) Signed-off-by: Zijing Liu --- vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 0f69a18a1f3f..5552c1ae5edf 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,8 @@ def _interleave_mxfp4_cutlass_sm90(w): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = w13_weight - layer.w2_weight = w2_weight + layer.w13_weight = Parameter(w13_weight.data, requires_grad=False) + layer.w2_weight = Parameter(w2_weight.data, requires_grad=False) else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") From c428e8d80b2bc17b0a306d1e80c8e4567b9dd9f4 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Date: Thu, 13 Nov 2025 06:34:14 -0500 Subject: [PATCH 007/578] Fix io processor pooling #28273 (#28484) Signed-off-by: baonudesifeizhai --- vllm/entrypoints/openai/serving_pooling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 0eade272111f..ee4c5c8bacaa 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -4,7 +4,7 @@ import asyncio import json import time -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Sequence from typing import Final, cast import jinja2 @@ -122,6 +122,10 @@ async def create_pooling( engine_prompts = await self.io_processor.pre_process_async( prompt=validated_prompt, request_id=request_id ) + if not isinstance(engine_prompts, Sequence) or isinstance( + engine_prompts, (str, bytes, bytearray) + ): + engine_prompts = [engine_prompts] elif isinstance(request, PoolingChatRequest): error_check_ret = self._validate_chat_template( From c47b6c85ac25ecb0a26dfff76c70a0b1a9a4a6bf Mon Sep 17 00:00:00 2001 From: zofia <110436990+zufangzhu@users.noreply.github.com> Date: Thu, 13 Nov 2025 19:35:04 +0800 Subject: [PATCH 008/578] [XPU] add sym params to IPEXConfig (#28611) Signed-off-by: Zhu, Zufang --- .../layers/quantization/ipex_quant.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index e0234191c62b..5ca9167faec8 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -52,6 +52,7 @@ def __init__( modules_to_not_convert: list[str] | None = None, desc_act: bool | None = None, lm_head_quantized: bool | None = None, + is_sym: bool | None = None, ) -> None: super().__init__() self.method = method @@ -60,6 +61,7 @@ def __init__( self.modules_to_not_convert = modules_to_not_convert or [] self.desc_act = desc_act self.lm_head_quantized = lm_head_quantized + self.is_sym = is_sym self.pack_factor = 32 // self.weight_bits if self.weight_bits not in [4]: @@ 
-108,15 +110,25 @@ def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": modules_to_not_convert = cls.get_from_keys_or( config, ["modules_to_not_convert"], None ) + is_sym = not cls.get_from_keys_or(config, ["zero_point"], default=False) return cls( - method, weight_bits, group_size, modules_to_not_convert, False, False + method, + weight_bits, + group_size, + modules_to_not_convert, + False, + False, + is_sym, ) # otherwise for gptq weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) - return cls(method, weight_bits, group_size, [], desc_act, lm_head_quantized) + is_sym = cls.get_from_keys_or(config, ["sym"], default=True) + return cls( + method, weight_bits, group_size, [], desc_act, lm_head_quantized, is_sym + ) @classmethod def override_quantization_method( @@ -180,6 +192,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # The float activation will be quantized (dynamic, per-token) to INT8. act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK + assert isinstance(self.quant_config, IPEXConfig) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype, lowp_mode=lowp_mode, @@ -200,6 +213,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: bias=bias, group_size=self.quant_config.group_size, quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"], + weight_qscheme="sym" if self.quant_config.is_sym else "asym", ) ) @@ -250,6 +264,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # The float activation will be quantized (dynamic, per-token) to INT8. 
act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH + assert isinstance(self.quant_config, IPEXConfig) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype, lowp_mode=lowp_mode, @@ -269,6 +284,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: bias=bias, group_size=self.quant_config.group_size, quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"], # type: ignore + weight_qscheme="sym" if self.quant_config.is_sym else "asym", ) ) From c9fe6abe7c0b03d552420edd63c6c678ed683dea Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Thu, 13 Nov 2025 21:06:06 +0800 Subject: [PATCH 009/578] [Bugfix] Fix FPS value type for Qwen2.5-Omni video processing (#28630) Signed-off-by: Lin, Fanli --- examples/offline_inference/vision_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 371cf6309a67..624de2a2debc 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1536,7 +1536,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str): mm_processor_kwargs={ "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, - "fps": [1], + "fps": 1, }, limit_mm_per_prompt={modality: 1}, ) From 86d15bfd8d681a2ca2f3b2e550149a5ba3282ef1 Mon Sep 17 00:00:00 2001 From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Date: Thu, 13 Nov 2025 19:02:21 +0530 Subject: [PATCH 010/578] [Hardware][PowerPC] Fix fp16 compilation error for Power in cpu attention backend and bump oneDNN version (#28535) Signed-off-by: Akash Kaothalkar Co-authored-by: Akash Kaothalkar --- cmake/cpu_extension.cmake | 4 ++-- csrc/cpu/cpu_attn_impl.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index bb0179c79c10..aa84125818d1 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -242,7 +242,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild" SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src" GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git - GIT_TAG v52.2.0 + GIT_TAG v52.6.0 GIT_SHALLOW TRUE GIT_PROGRESS TRUE ) @@ -310,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.9 + GIT_TAG v3.10 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 8f4c78099802..c317453530af 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -821,10 +821,12 @@ struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#if !defined(__powerpc__) template <> struct VecTypeTrait { using vec_t = vec_op::FP16Vec16; }; +#endif template void print_logits(const char* name, T* ptr, int32_t row, int32_t col, From 8da2f28f53c14e2c21c50821d89e3909d9c84af6 Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 13 Nov 2025 22:18:20 +0800 Subject: [PATCH 011/578] [ROCm][BugFix]Fix `get_cu_count` in rocm_aiter_fa.py (#28618) Signed-off-by: ganyi --- vllm/v1/attention/backends/rocm_aiter_fa.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index c7f925817a6a..ad454daa582e 100644 --- 
a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -18,6 +18,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +from vllm.utils.platform_utils import get_cu_count from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -38,7 +39,7 @@ def block_size(x, head_dim): return min(65536 // x.element_size(), triton.next_power_of_2(head_dim)) def num_programs(total_tokens): - return min(total_tokens, current_platform.get_cu_count()) + return min(total_tokens, get_cu_count()) @triton.jit def cp_mha_gather_cache_kernel( From a7791eac9d29a4a26b007db42130a9e28b3e77ee Mon Sep 17 00:00:00 2001 From: amdfaa <107946068+amdfaa@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:34:55 -0500 Subject: [PATCH 012/578] [CI/Build] Install uv for AMD MI300: Language Models Tests (Hybrid) %N (#28142) Signed-off-by: amdfaa <107946068+amdfaa@users.noreply.github.com> Signed-off-by: zhewenli Co-authored-by: zhewenli --- docker/Dockerfile.rocm | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 06d229f315bd..137452cad2c1 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -15,6 +15,20 @@ RUN apt-get update -q -y && apt-get install -q -y \ # Remove sccache RUN python3 -m pip install --upgrade pip RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" + +# Install UV +RUN curl -LsSf https://astral.sh/uv/install.sh | sh + +# Activate virtual environment and add uv to PATH +ENV PATH="/root/.local/bin:$PATH" + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + ARG COMMON_WORKDIR WORKDIR ${COMMON_WORKDIR} @@ -59,13 +73,15 @@ FROM base AS test RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* -# Install vLLM +# Install vLLM using uv (inherited from base stage) +# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + --mount=type=cache,target=/root/.cache/uv \ cd /install \ - && pip install -U -r requirements/rocm.txt \ - && pip install -U -r requirements/rocm-test.txt \ + && uv pip install --system -r requirements/rocm.txt \ + && uv pip install --system -r requirements/rocm-test.txt \ && pip uninstall -y vllm \ - && pip install *.whl + && uv pip install --system *.whl WORKDIR /vllm-workspace ARG COMMON_WORKDIR @@ -89,14 +105,17 @@ RUN case "$(which python3)" in \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ *) ;; esac -RUN python3 -m pip install --upgrade huggingface-hub[cli] +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --upgrade huggingface-hub[cli] -# Install vLLM +# Install vLLM using uv (inherited from base stage) +# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + --mount=type=cache,target=/root/.cache/uv \ cd /install \ - && pip install -U -r requirements/rocm.txt \ + && uv pip install --system -r requirements/rocm.txt \ && pip uninstall -y vllm \ - && pip install *.whl + && uv pip 
install --system *.whl ARG COMMON_WORKDIR From 07a606aa7eb30923a3cc631185d93de9e51b37cb Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Thu, 13 Nov 2025 07:11:27 -0800 Subject: [PATCH 013/578] [CI Failure] Fix backend selection for encoder-only models (#28534) Signed-off-by: Huamin Li <3ericli@gmail.com> --- vllm/attention/backends/abstract.py | 14 ++++++++++++++ vllm/attention/layer.py | 1 + vllm/attention/layers/encoder_only_attention.py | 6 +++++- vllm/attention/selector.py | 5 +++++ vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 10 ++++++++++ vllm/platforms/interface.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + vllm/v1/attention/backends/cpu_attn.py | 11 +++++++++++ vllm/v1/attention/backends/flash_attn.py | 12 ++++++++++++ vllm/v1/attention/backends/flex_attention.py | 7 +++++++ vllm/v1/attention/backends/mla/flashmla_sparse.py | 10 +++++----- 14 files changed, 75 insertions(+), 6 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 697beed91869..9275d70fd86a 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -142,6 +142,17 @@ def supports_sink(cls) -> bool: def is_sparse(cls) -> bool: return False + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """Check if backend supports a given attention type. + + By default, only supports decoder attention. + Backends should override this to support other attention types. + """ + from vllm.attention import AttentionType + + return attn_type == AttentionType.DECODER + @classmethod def supports_compute_capability(cls, capability: "DeviceCapability") -> bool: return True @@ -171,6 +182,7 @@ def validate_configuration( has_sink: bool, use_sparse: bool, device_capability: "DeviceCapability", + attn_type: str, ) -> list[str]: invalid_reasons = [] if not cls.supports_head_size(head_size): @@ -195,6 +207,8 @@ def validate_configuration( invalid_reasons.append("non-sparse not supported") if not cls.supports_compute_capability(device_capability): invalid_reasons.append("compute capability not supported") + if not cls.supports_attn_type(attn_type): + invalid_reasons.append(f"attention type {attn_type} not supported") combination_reason = cls.supports_combination( head_size, dtype, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 487bba76babf..37f9a4b383ce 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -291,6 +291,7 @@ def __init__( block_size, use_mla=False, has_sink=self.has_sink, + attn_type=attn_type, ) else: self.attn_backend = attn_backend diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/attention/layers/encoder_only_attention.py index 4929bbf5efc7..5e99c9901003 100644 --- a/vllm/attention/layers/encoder_only_attention.py +++ b/vllm/attention/layers/encoder_only_attention.py @@ -74,7 +74,11 @@ def __init__( block_size = 16 underlying_attn_backend = get_attn_backend( - head_size, dtype, kv_cache_dtype, block_size + head_size, + dtype, + kv_cache_dtype, + block_size, + attn_type=AttentionType.ENCODER_ONLY, ) attn_backend = create_encoder_only_attention_backend(underlying_attn_backend) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 262cdf0e575b..1a092db9ce37 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -76,6 +76,7 @@ def get_attn_backend( use_mla: bool = False, has_sink: bool = False, use_sparse: bool = False, + attn_type: str | None = None, ) -> 
type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -94,6 +95,7 @@ def get_attn_backend( use_mla=use_mla, has_sink=has_sink, use_sparse=use_sparse, + attn_type=attn_type, ) @@ -106,6 +108,7 @@ def _cached_get_attn_backend( use_mla: bool = False, has_sink: bool = False, use_sparse: bool = False, + attn_type: str | None = None, ) -> type[AttentionBackend]: # Check whether a particular choice of backend was # previously forced. @@ -159,6 +162,7 @@ def _cached_get_attn_backend( use_mla, has_sink, use_sparse, + attn_type, ) else: attention_cls = current_platform.get_attn_backend_cls( @@ -170,6 +174,7 @@ def _cached_get_attn_backend( use_mla, has_sink, use_sparse, + attn_type, ) if not attention_cls: raise ValueError( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 8b3b8d4cb44f..cf954768689f 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -134,6 +134,7 @@ def get_attn_backend_cls( use_mla: bool, has_sink: bool, use_sparse: bool, + attn_type: str | None = None, ) -> str: from vllm.attention.backends.registry import AttentionBackendEnum diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index ebcc290a64cd..2e4dd8bb808b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -298,6 +298,7 @@ def get_valid_backends( has_sink, use_sparse, device_capability, + attn_type, ) -> tuple[ list[tuple["AttentionBackendEnum", int]], dict["AttentionBackendEnum", list[str]], @@ -318,6 +319,7 @@ def get_valid_backends( has_sink, use_sparse, device_capability, + attn_type, ) except ImportError: invalid_reasons_i = ["ImportError"] @@ -339,7 +341,13 @@ def get_attn_backend_cls( use_mla: bool, has_sink: bool, use_sparse: bool, + attn_type: str | None = None, ) -> str: + from vllm.attention import AttentionType + + if attn_type is None: + attn_type = AttentionType.DECODER + device_capability = cls.get_device_capability() assert device_capability is not None @@ -356,6 +364,7 @@ def get_attn_backend_cls( has_sink, use_sparse, device_capability, + attn_type, ) except ImportError: invalid_reasons = ["ImportError"] @@ -379,6 +388,7 @@ def get_attn_backend_cls( has_sink, use_sparse, device_capability, + attn_type, ) reasons_str = ( "{" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 12c377384270..0471c20429b1 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -222,6 +222,7 @@ def get_attn_backend_cls( use_mla: bool, has_sink: bool, use_sparse: bool, + attn_type: str | None = None, ) -> str: """Get the attention backend class of a device.""" return "" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d20dc9e6b067..788f9d69c357 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -216,6 +216,7 @@ def get_attn_backend_cls( use_mla, has_sink, use_sparse, + attn_type: str | None = None, ) -> str: from vllm._aiter_ops import rocm_aiter_ops from vllm.attention.backends.registry import AttentionBackendEnum diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 4773fef6829d..b997bb9e6999 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -61,6 +61,7 @@ def get_attn_backend_cls( use_mla: bool, has_sink, use_sparse, + attn_type: str | None = None, ) -> str: from vllm.attention.backends.registry import AttentionBackendEnum diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index c629325f76a3..5552e4ca4b2f 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -51,6 +51,7 @@ def get_attn_backend_cls( use_mla: 
bool, has_sink: bool, use_sparse, + attn_type: str | None = None, ) -> str: from vllm.v1.attention.backends.utils import set_kv_cache_layout diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 674398e19c4c..f1254352c058 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -48,6 +48,17 @@ def get_supported_head_sizes(cls) -> list[int]: def get_name() -> str: return "CPU_ATTN" + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """CPU attention supports decoder and encoder-only attention.""" + from vllm.attention import AttentionType + + return attn_type in ( + AttentionType.DECODER, + AttentionType.ENCODER, + AttentionType.ENCODER_ONLY, + ) + @staticmethod def get_impl_cls() -> type["CPUAttentionBackendImpl"]: return CPUAttentionBackendImpl diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index d9bd52d8f980..bfb4a45c2b56 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -66,6 +66,18 @@ class FlashAttentionBackend(AttentionBackend): def get_name() -> str: return "FLASH_ATTN" + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """FlashAttention supports all attention types.""" + from vllm.attention import AttentionType + + return attn_type in ( + AttentionType.DECODER, + AttentionType.ENCODER, + AttentionType.ENCODER_ONLY, + AttentionType.ENCODER_DECODER, + ) + @staticmethod def get_impl_cls() -> type["FlashAttentionImpl"]: return FlashAttentionImpl diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index e53cd0d8af4f..7768827d26dc 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -84,6 +84,13 @@ class FlexAttentionBackend(AttentionBackend): def get_name() -> str: return "FLEX_ATTENTION" + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """FlexAttention supports both decoder and encoder-only attention.""" + from vllm.attention import AttentionType + + return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) + @staticmethod def get_impl_cls() -> type["FlexAttentionImpl"]: return FlexAttentionImpl diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 5fe9c69d3500..bb8d914d1571 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -40,14 +40,14 @@ """ NOTE: FlashMLA Sparse uses an fp8 cache with the following format -In the "FP8 with scale" format, each token's KV cache is 656 Bytes, +In the "FP8 with scale" format, each token's KV cache is 656 Bytes, structured as: -- **First 512 bytes:** The "quantized NoPE" part, containing 512 +- **First 512 bytes:** The "quantized NoPE" part, containing 512 `float8_e4m3` values. -- **Next 16 bytes:** Scale factors, containing 4 `float32` values. - The first `float32` is the scale for the first 128 `float8_e4m3` values, +- **Next 16 bytes:** Scale factors, containing 4 `float32` values. + The first `float32` is the scale for the first 128 `float8_e4m3` values, the second for the next 128, and so on. -- **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This +- **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This part is not quantized for accuracy. 
""" From 3035d1a166821272d4e7eb204e2c613bb02bacd7 Mon Sep 17 00:00:00 2001 From: Yuanping Song Date: Thu, 13 Nov 2025 10:24:35 -0500 Subject: [PATCH 014/578] [BugFix] DeepSeek-OCR: apply NoRepeatNGramLogitsProcessor to greedy path (#28617) Signed-off-by: Yuanping Song --- vllm/model_executor/models/deepseek_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index c89caab93a1e..8179f916ff41 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -161,7 +161,7 @@ def validate_params(cls, params: SamplingParams): ) def is_argmax_invariant(self) -> bool: - return True + return False def new_req_logits_processor( self, From b230286fbc0b6d192e176ead55000471fd4f1080 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:02:42 +0000 Subject: [PATCH 015/578] Fix `get_num_experts` when config sets it explicitly to `None` (#28652) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: bruceszchen --- vllm/config/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index c47b619118ff..f4ed99689e5b 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1342,7 +1342,8 @@ def get_num_experts(self) -> int: # Ernie VL's remote code uses list[int]... # The values are always the same so we just take the first one. return num_experts[0] - return num_experts + # Coerce to 0 if explicitly set to None + return num_experts or 0 def get_layers_start_end_indices( self, parallel_config: ParallelConfig From d3387750f191f3bcf6607db95436147bbccfacb3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 13 Nov 2025 08:38:08 -0800 Subject: [PATCH 016/578] [Misc] Turn off encoder torch compile by default (#28634) Signed-off-by: Roger Wang --- tests/compile/test_multimodal_compile.py | 9 ++++++--- tests/models/multimodal/generation/test_common.py | 2 ++ vllm/config/compilation.py | 5 +++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/test_multimodal_compile.py index b76c29819a2d..621f6a51a918 100644 --- a/tests/compile/test_multimodal_compile.py +++ b/tests/compile/test_multimodal_compile.py @@ -10,8 +10,8 @@ def test_compile(): vllm_config = VllmConfig() - # Default configuration compiles mm encoder - assert vllm_config.compilation_config.compile_mm_encoder + # Default configuration does not compile mm encoder + assert not vllm_config.compilation_config.compile_mm_encoder # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @@ -39,7 +39,10 @@ def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): "Qwen/Qwen2.5-VL-3B-Instruct", max_model_len=2048, gpu_memory_utilization=0.8, - compilation_config={"mode": CompilationMode.VLLM_COMPILE}, + compilation_config={ + "mode": CompilationMode.VLLM_COMPILE, + "compile_mm_encoder": True, + }, ) as _, ): pass diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 5504c417fda4..22083d9f1614 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -131,6 +131,7 @@ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 img_idx_to_prompt=lambda idx: 
"<|vision_start|><|image_pad|><|vision_end|>", video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", + enforce_eager=False, max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, @@ -160,6 +161,7 @@ VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO, ), + enforce_eager=False, needs_video_metadata=True, prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index b0d1bc2bab30..10673041aa68 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -320,9 +320,10 @@ class CompilationConfig: If None, defaults to attention ops for piecewise cudagraphs. If empty list [], no ops are excluded (suitable for full cudagraphs).""" - compile_mm_encoder: bool = True + compile_mm_encoder: bool = False """Whether or not to compile the multimodal encoder. - Currently, this only works for `Qwen2_5_vl`.""" + Currently, this only works for `Qwen2_5_vl` on selected platforms. + Disabled by default until more models are supported/tested to work.""" # Inductor capture use_inductor: bool | None = None From 06c4873d959feb0d4cb062ef17cdd0dd09dbf10f Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:52:50 -0500 Subject: [PATCH 017/578] Rewrite C++ meta funcs to Python (#28595) Signed-off-by: Jane Xu --- .../gptq_marlin/awq_marlin_repack.cu | 16 -------- .../gptq_marlin/gptq_marlin_repack.cu | 16 -------- vllm/_custom_ops.py | 39 ++++++++++++++++++- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu index 8ba617a9e655..e607107b3e77 100644 --- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu +++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu @@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, return out; } -torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight, - c10::SymInt size_k, c10::SymInt size_n, - int64_t num_bits) { - int const pack_factor = 32 / num_bits; - auto options = torch::TensorOptions() - .dtype(b_q_weight.dtype()) - .device(b_q_weight.device()); - return torch::empty_symint( - {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor}, - options); -} - TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("awq_marlin_repack", &awq_marlin_repack); } - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) { - m.impl("awq_marlin_repack", &awq_marlin_repack_meta); -} diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu index 7c2d089a70d9..ad80d51ece94 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu @@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, return out; } -torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight, - torch::Tensor& perm, c10::SymInt size_k, - c10::SymInt size_n, int64_t num_bits) { - int const pack_factor = 32 / num_bits; - auto options = torch::TensorOptions() - .dtype(b_q_weight.dtype()) - .device(b_q_weight.device()); - return torch::empty_symint( - {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor}, - options); -} - 
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("gptq_marlin_repack", &gptq_marlin_repack); } - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) { - m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta); -} diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 7d70c01cefbb..096266c9764e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1174,13 +1174,50 @@ def gptq_marlin_repack( return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits) -# gptq_marlin +if hasattr(torch.ops._C, "gptq_marlin_repack"): + + @register_fake("_C::gptq_marlin_repack") + def _gptq_marlin_repack_fake( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: torch.SymInt, + size_n: torch.SymInt, + num_bits: int, + ) -> torch.Tensor: + pack_factor = 32 // num_bits + marlin_tile_size = 16 + return torch.empty( + (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + + +# awq_marlin def awq_marlin_repack( b_q_weight: torch.Tensor, size_k: int, size_n: int, num_bits: int ) -> torch.Tensor: return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits) +if hasattr(torch.ops._C, "awq_marlin_repack"): + + @register_fake("_C::awq_marlin_repack") + def _awq_marlin_repack_fake( + b_q_weight: torch.Tensor, + size_k: torch.SymInt, + size_n: torch.SymInt, + num_bits: int, + ) -> torch.Tensor: + pack_factor = 32 // num_bits + marlin_tile_size = 16 + return torch.empty( + (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + + def gptq_marlin_moe_repack( b_q_weight: torch.Tensor, perm: torch.Tensor, From 327c0a9a23f2939923d02fbf882640753bf1e030 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 13 Nov 2025 09:14:08 -0800 Subject: [PATCH 018/578] [BugFix] Ensure `EngineArgs.create_engine_config` is idempotent (#28515) Signed-off-by: Nick Hill --- vllm/engine/arg_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 13c7704f5bf3..ca7f5e5e3e05 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1631,40 +1631,39 @@ def create_engine_config( ) observability_config = ObservabilityConfig( - show_hidden_metrics_for_version=(self.show_hidden_metrics_for_version), + show_hidden_metrics_for_version=self.show_hidden_metrics_for_version, otlp_traces_endpoint=self.otlp_traces_endpoint, collect_detailed_traces=self.collect_detailed_traces, ) # Compilation config overrides + compilation_config = copy.deepcopy(self.compilation_config) if self.cuda_graph_sizes is not None: logger.warning( "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or " "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes " "instead." ) - if self.compilation_config.cudagraph_capture_sizes is not None: + if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( "cuda_graph_sizes and compilation_config." "cudagraph_capture_sizes are mutually exclusive" ) - self.compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes + compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes if self.cudagraph_capture_sizes is not None: - if self.compilation_config.cudagraph_capture_sizes is not None: + if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( "cudagraph_capture_sizes and compilation_config." 
"cudagraph_capture_sizes are mutually exclusive" ) - self.compilation_config.cudagraph_capture_sizes = ( - self.cudagraph_capture_sizes - ) + compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes if self.max_cudagraph_capture_size is not None: - if self.compilation_config.max_cudagraph_capture_size is not None: + if compilation_config.max_cudagraph_capture_size is not None: raise ValueError( "max_cudagraph_capture_size and compilation_config." "max_cudagraph_capture_size are mutually exclusive" ) - self.compilation_config.max_cudagraph_capture_size = ( + compilation_config.max_cudagraph_capture_size = ( self.max_cudagraph_capture_size ) @@ -1679,7 +1678,7 @@ def create_engine_config( load_config=load_config, structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, - compilation_config=self.compilation_config, + compilation_config=compilation_config, kv_transfer_config=self.kv_transfer_config, kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, From fdfd5075aa0b9b32e3000554d719f1622acff800 Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:36:54 -0800 Subject: [PATCH 019/578] [TPU] patch TPU wheel build script to resolve metadata issue (#27279) Signed-off-by: Johnny Yang --- setup.py | 4 +++- tools/vllm-tpu/build.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0934a8608eb1..e9b36e2a2e03 100644 --- a/setup.py +++ b/setup.py @@ -545,7 +545,9 @@ def get_vllm_version() -> str: # Allow overriding the version. This is useful to build platform-specific # wheels (e.g. CPU, TPU) without modifying the source. if env_version := os.getenv("VLLM_VERSION_OVERRIDE"): - return env_version + print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE") + os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version + return get_version(write_to="vllm/_version.py") version = get_version(write_to="vllm/_version.py") sep = "+" if "+" not in version else "." # dev versions might contain + diff --git a/tools/vllm-tpu/build.sh b/tools/vllm-tpu/build.sh index fbc91e379df3..45ef8dfcb1db 100755 --- a/tools/vllm-tpu/build.sh +++ b/tools/vllm-tpu/build.sh @@ -7,6 +7,13 @@ TOOLS_DIR=$(cd "$(dirname "$SCRIPT_PATH_PARAM")" && pwd) # Absolute path to the REPO_ROOT=$(cd "$TOOLS_DIR/../../" && pwd) # Absolute path to the repo root VLLM_DIR="$REPO_ROOT/" # Path to the vllm sources +CHANGE_FILE_LIST=( + "vllm/entrypoints/cli/main.py" + "vllm/entrypoints/cli/run_batch.py" + "vllm/utils/__init__.py" + "vllm/platforms/__init__.py" +) + # Ensure we are not running from within the vllm directory if SCRIPT_PATH_PARAM is relative like "." if [ "$TOOLS_DIR" = "$VLLM_DIR" ]; then echo "Error: This script should not be run from the vllm directory directly if using relative paths." @@ -30,6 +37,20 @@ if ! grep -q "name = \"vllm-tpu\"" "$PYPROJECT_FILE"; then echo "Patching pyproject.toml project name to vllm-tpu..." cp "$PYPROJECT_FILE" "${PYPROJECT_FILE}.bak" sed -i '0,/^name = "vllm"/s//name = "vllm-tpu"/' "$PYPROJECT_FILE" + + echo "Patching ${CHANGE_FILE_LIST[@]} vllm to vllm-tpu..." 
+ # patching + # importlib.metadata.version('vllm') -> importlib.metadata.version('vllm-tpu') + # importlib.metadata.version("vllm") -> importlib.metadata.version("vllm-tpu") + # importlib.metadata.metadata('vllm') -> importlib.metadata.metadata('vllm-tpu') + # importlib.metadata.metadata("vllm") -> importlib.metadata.metadata("vllm-tpu") + # version('vllm') -> version('vllm-tpu') + # version("vllm") -> version("vllm-tpu") + sed -i \ + -e "s/importlib.metadata.version(\(['\"]\)vllm\1)/importlib.metadata.version(\1vllm-tpu\1)/" \ + -e "s/importlib.metadata.metadata(\(['\"]\)vllm\1)/importlib.metadata.metadata(\1vllm-tpu\1)/" \ + -e "s/version(\(['\"]\)vllm\1)/version(\1vllm-tpu\1)/" \ + "${CHANGE_FILE_LIST[@]}" PATCHED=true else PATCHED=false @@ -45,6 +66,13 @@ cleanup() { echo "Restoring original pyproject.toml..." cp "${PYPROJECT_FILE}.bak" "$PYPROJECT_FILE" rm -f "${PYPROJECT_FILE}.bak" + + echo "Restoring vllm code..." + sed -i \ + -e "s/importlib.metadata.version(\(['\"]\)vllm-tpu\1)/importlib.metadata.version(\1vllm\1)/" \ + -e "s/importlib.metadata.metadata(\(['\"]\)vllm-tpu\1)/importlib.metadata.metadata(\1vllm\1)/" \ + -e "s/version(\(['\"]\)vllm-tpu\1)/version(\1vllm\1)/" \ + "${CHANGE_FILE_LIST[@]}" fi } trap cleanup EXIT HUP INT QUIT PIPE TERM # Register cleanup function to run on script exit and various signals From fe1cd7704ddd3266ddc97181ab24a167b3c9223c Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 13 Nov 2025 13:16:55 -0500 Subject: [PATCH 020/578] [Performance][B200] silu_mul_quant: pack scales in int32 (#28358) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- csrc/quantization/activation_kernels.cu | 164 ++++++--- tests/conftest.py | 13 + tests/kernels/moe/test_deepep_deepgemm_moe.py | 18 +- tests/kernels/moe/test_deepep_moe.py | 2 +- .../moe/test_silu_mul_fp8_quant_deep_gemm.py | 311 +++++++++++++----- .../layers/fused_moe/batched_deep_gemm_moe.py | 76 +++-- vllm/utils/deep_gemm.py | 23 ++ 7 files changed, 461 insertions(+), 146 deletions(-) diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 2521b2797e2c..0c3bcf3b64b2 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -279,17 +279,17 @@ __device__ __forceinline__ void token_bounds(int32_t n_tokens, } template + typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0, + int GROUP_SIZE = 128, int NUM_STAGES = 3> __global__ void silu_mul_fp8_quant_deep_gemm_kernel( const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q, - float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, + scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, // sizes Idx_t E, Idx_t T, Idx_t H, // strides (in elements) Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e, Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t, - Idx_t stride_ys_g, Idx_t stride_counts_e) { + Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) { #ifndef USE_ROCM static constexpr int NUM_WARPS = THREADS / WARP_SIZE; @@ -466,9 +466,22 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( __nv_fp8x4_e4m3* y_q_base_ptr = reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id; - auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g; + + Idx_t scale_group_offset = 0; + if constexpr (std::is_same::value) { + // packed int32_t format + int pack_id = warp_position_scales / 4; + int scale_in_pack = 
warp_position_scales % 4; + scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g; + } else { + scale_group_offset = warp_position_scales * stride_ys_g; + } + + scale_t* const y_scale_base_ptr = _y_s + scale_group_offset; for (auto j = tokens_lower; j < tokens_upper; j++) { + int current_group_id = warp_position_scales; // Running count of which + // group is being processed const Idx_t base_ys = expert_id * stride_ys_e; auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t; __nv_fp8x4_e4m3* y_q_ptr = @@ -509,7 +522,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( __nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv); - if constexpr (USE_UE8M0) { + if constexpr (CEIL_UE8M0) { y_s = hexp2(hceil(hlog2(y_s))); } @@ -527,8 +540,24 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( y_q_ptr += WARP_SIZE * stride_yq_h; if (!lane_id) { - *y_s_ptr = y_s; - y_s_ptr += stride_ys_g; + // Store scales. + if constexpr (std::is_same::value) { + // Packed UE8MO format. Remove Mantissa. + *y_s_ptr = reinterpret_cast(y_s) >> 7; + + bool const jump_pack = (current_group_id + 1) % 4 == 0; + // Minus 3 because we need to get to the first group in the + // next pack. + y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g; + + } else { + // float32 format + static_assert(std::is_same::value); + *y_s_ptr = y_s; + y_s_ptr += stride_ys_g; + } + + current_group_id += 1; } } } @@ -573,7 +602,7 @@ void persistent_masked_m_silu_mul_quant( const at::Tensor& tokens_per_expert, // (E) at::Tensor& y_q, // (E, T, H) [OUT] at::Tensor& y_s, // (E, T, H//group_size) [OUT] - bool use_ue8m0) { + bool cast_scale_ue8m0) { #ifndef USE_ROCM // This kernel currently only supports H % 128 == 0 and assumes a @@ -583,9 +612,12 @@ void persistent_masked_m_silu_mul_quant( TORCH_CHECK(input.dtype() == torch::kBFloat16); TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn || y_q.dtype() == torch::kFloat8_e4m3fnuz); - TORCH_CHECK(y_s.dtype() == torch::kFloat32); TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0); + bool const is_packed_ue8m0 = + (y_s.dtype() == torch::kInt32 && cast_scale_ue8m0); + TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0); + using Idx_t = int64_t; Idx_t E = input.size(0); @@ -597,15 +629,18 @@ void persistent_masked_m_silu_mul_quant( Idx_t stride_yq_e = y_q.stride(0); Idx_t stride_yq_t = y_q.stride(1); Idx_t stride_yq_h = y_q.stride(2); - Idx_t stride_ys_e = y_s.stride(0); - Idx_t stride_ys_t = y_s.stride(1); - Idx_t stride_ys_g = y_s.stride(2); Idx_t stride_counts_e = tokens_per_expert.stride(0); + int const NUM_GROUPS = H / GROUP_SIZE; + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - #define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \ + // TODO: Get this from cuda_arch ? 
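The packed path above relies on the bf16 bit layout (1 sign bit, 8 exponent bits, 7 mantissa bits): once a scale has been rounded up to a power of two, shifting out the 7 mantissa bits leaves exactly its UE8M0 byte, and four such bytes share one int32 slot (hence the jump by stride_ys_p after every fourth group). A minimal Python sketch of that bit trick, illustrative only and not part of the kernel or tests:

    import torch

    # Power-of-two scales, as the ceil-UE8M0 path above produces.
    scales = torch.tensor([0.5, 1.0, 2.0, 0.0078125], dtype=torch.bfloat16)
    # Reinterpret the bf16 bits and drop the 7 mantissa bits -> biased exponent.
    ue8m0 = (scales.view(torch.int16).to(torch.int32) >> 7) & 0xFF
    # Same value the long way: log2 of the scale plus the exponent bias of 127.
    expected = torch.log2(scales.to(torch.float32)).to(torch.int32) + 127
    assert torch.equal(ue8m0, expected)  # tensor([126, 127, 128, 120])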
+ static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32; + + #define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \ + STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES) \ static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \ int sms = SILU_V2_BLOCK_COUNT; \ static constexpr int max_shared_mem_bytes = \ @@ -615,43 +650,86 @@ void persistent_masked_m_silu_mul_quant( VLLM_DISPATCH_FP8_TYPES( \ y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \ vllm::silu_mul_fp8_quant_deep_gemm_kernel< \ - BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \ - USE_UE8M0, GROUP_SIZE, STAGES> \ + BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \ + Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES> \ <<>>( \ reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \ - (fp8_t*)y_q.data_ptr(), y_s.data_ptr(), \ + (fp8_t*)y_q.data_ptr(), \ + reinterpret_cast(y_s.data_ptr()), \ reinterpret_cast(tokens_per_expert.data_ptr()), E, \ T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \ - stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \ - stride_ys_g, stride_counts_e); \ + stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T, \ + STRIDE_YS_G, STRIDE_YS_P, stride_counts_e); \ }); - static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32; - - int const NUM_GROUPS = H / GROUP_SIZE; - if (!use_ue8m0) { - if (H >= 4096 && (NUM_GROUPS % 8 == 0)) { - /* 8 warps config */ - static constexpr int NUM_STAGES = 4; - static constexpr int THREAD_COUNT = 256; - KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES); - } else { - /* 1 warp config */ - static constexpr int THREAD_COUNT = 32; - KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2); - } - } else { - if (H >= 4096 && (NUM_GROUPS % 8 == 0)) { - /* 8 warps config */ - static constexpr int NUM_STAGES = 4; - static constexpr int THREAD_COUNT = 256; - KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES); - } else { - /* 1 warp config */ - static constexpr int THREAD_COUNT = 32; - KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2); + #define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \ + STRIDE_YS_P, CEIL_UE8M0) \ + if (H >= 4096 && (NUM_GROUPS % 8) == 0) { \ + /* 8 warp config */ \ + static constexpr int NUM_STAGES = 4; \ + static constexpr int THREAD_COUNT = 256; \ + KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \ + STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \ + } else { \ + /* 1 warp config */ \ + static constexpr int THREAD_COUNT = 32; \ + KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \ + STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2); \ } + + Idx_t stride_ys_e = y_s.stride(0); + Idx_t stride_ys_t = y_s.stride(1); + Idx_t stride_ys_g = y_s.stride(2); + Idx_t stride_ys_p = 0; + if (!cast_scale_ue8m0) { + TORCH_CHECK(!is_packed_ue8m0); + LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p, + false); + return; + } + + if (!is_packed_ue8m0) { + // UE8M0 but not packed + LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p, + true); + return; } + TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0); + TORCH_CHECK(y_s.dtype() == torch::kInt32); + + // Int32 packed ue8m0 scales tensor. + // Let E, T, G be the number to experts, number of tokens and number of groups + // respectively. Let, E = 2, T = 4, G = 6, in this case the int32 scales + // tensor are of shape [1, 4, 2] and stride [8, 1, 4]. 
The scales are expected + // to be arranged as follows, + // [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X,], + // [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X,] + // [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X,] + // [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X,]] + // where, TxGy is the scale ue8m0 scale value of Token x, Group y. + // + // In memory (in bytes) the scale values are arranged as, + // [T0G0, T0G1, T0G2, T0G3, T1G0, T1G2, T1G3, T1G4, T2G0, T2G1, T2G3, T2G4, + // T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5, + // X, X, T3G4, T3G5, X, X] + // + // An Int32 tensor of size [1, 4, 2] and stride [8, 1, 4] can be represented + // as an uint8 tensor of shape [1, 2, 4, 4] and stride [32, 16, 4, 1]. In + // english, ignoring the Experts dimension, the original int32 tensor is + // simply treated as two packed [4, 4] uint8 tensor (or two [4, 1] int32 + // tensor). The following strides setting reflects this change. Caveat: This + // means that the G dimension is no longer contiguous. i.e. Note that to move + // from G3 to G4, we need to jump along the packing dimension. The kernel + // handles this case. + + stride_ys_e *= sizeof(int32_t); + stride_ys_p = T * sizeof(int32_t); // Packing dimension + stride_ys_t = sizeof(int32_t); + stride_ys_g = 1; + + LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p, + true); + #endif } diff --git a/tests/conftest.py b/tests/conftest.py index 5e127e4e939e..b17081352edc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1384,3 +1384,16 @@ def image_urls(request, local_asset_server) -> list[str]: """Indirect fixture: takes a list of names, returns list of full URLs.""" names: list[str] = request.param return [local_asset_server.url_for(name) for name in names] + + +@pytest.fixture +def disable_deepgemm_ue8m0(monkeypatch): + from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used + + with monkeypatch.context() as monkeypatch_ctx: + monkeypatch_ctx.setenv("VLLM_USE_DEEP_GEMM_E8M0", "0") + is_deep_gemm_e8m0_used.cache_clear() + yield + # Clear cache so the next time it is used it is processed with the + # default VLLM_USE_DEEP_GEMM_E8M0 setting. + is_deep_gemm_e8m0_used.cache_clear() diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 9d039b81690a..0faf8bc95d2e 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -21,7 +21,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform -from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported +from vllm.utils.deep_gemm import ( + get_mk_alignment_for_contiguous_layout, + is_deep_gemm_e8m0_used, + is_deep_gemm_supported, +) from vllm.utils.import_utils import has_deep_ep, has_deep_gemm from ...utils import multi_gpu_test @@ -413,19 +417,16 @@ def _test_deepep_deepgemm_moe( @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif( - is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM" -) def test_ht_deepep_deepgemm_moe( mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int], + disable_deepgemm_ue8m0, ): """ Tests for High-Throughput DeepEP + DeepGemm integration. 
""" - import deep_gemm m, n, k = mnk current_platform.seed_everything(7) @@ -433,7 +434,7 @@ def test_ht_deepep_deepgemm_moe( if topk > num_experts: pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") - block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + block_m = get_mk_alignment_for_contiguous_layout()[0] block_size = [block_m, block_m] world_size, dp_size = world_dp_size @@ -487,9 +488,6 @@ def test_ht_deepep_deepgemm_moe( @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif( - is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM" -) def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], num_experts: int, @@ -497,10 +495,12 @@ def test_ll_deepep_deepgemm_moe( use_fp8_dispatch: bool, block_size: list[int], world_dp_size: tuple[int, int], + disable_deepgemm_ue8m0, ): """ Tests for Low-Latency DeepEP + DeepGemm integration. """ + assert not is_deep_gemm_e8m0_used() m, n, k = mnk current_platform.seed_everything(7) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index b49319a7e6f5..d78b8250463a 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -294,7 +294,7 @@ def torch_moe_impl( # blockwise quant and de-quant. assert not per_act_token_quant a = test_tensors.rank_tokens - aq, aq_scale = per_token_group_quant_fp8(a, 128) + aq, aq_scale = per_token_group_quant_fp8(a, 128, use_ue8m0=False) a = ( (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)) .view(a.shape) diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 420dbbffaac0..d6b78dd2c232 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import random + import pytest import torch @@ -8,27 +11,30 @@ persistent_masked_m_silu_mul_quant, ) from vllm.platforms import current_platform -from vllm.utils.math_utils import cdiv +from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm +from vllm.utils.math_utils import cdiv, round_up fp8_dtype = torch.float8_e4m3fn CASES = [ (1, 1, 128, fp8_dtype), - (1, 4, 128, fp8_dtype), - (2, 4, 256, fp8_dtype), - (32, 64, 256, fp8_dtype), - (17, 31, 768, fp8_dtype), - (1, 1, 128 * 1, fp8_dtype), - (1, 1, 128 * 3, fp8_dtype), - (1, 1, 128 * 4, fp8_dtype), - (8, 16, 128 * 1, fp8_dtype), - (8, 16, 128 * 2, fp8_dtype), - (8, 16, 128 * 3, fp8_dtype), + (1, 4, 128 * 1, fp8_dtype), + (2, 4, 128 * 2, fp8_dtype), + (1, 4, 128 * 3, fp8_dtype), + (8, 16, 128 * 4, fp8_dtype), + (8, 16, 128 * 5, fp8_dtype), + (8, 16, 128 * 6, fp8_dtype), + (8, 16, 128 * 7, fp8_dtype), + (8, 16, 128 * 8, fp8_dtype), + (8, 16, 128 * 9, fp8_dtype), (8, 64, 7168, fp8_dtype), (8, 128, 128 * 33, fp8_dtype), + (1, 4, 128 * 10, fp8_dtype), (8, 128, 7168, fp8_dtype), (8, 512, 7168, fp8_dtype), (8, 1024, 7168, fp8_dtype), + (17, 31, 768, fp8_dtype), + (32, 64, 256, fp8_dtype), (256, 8, 7168, fp8_dtype), (256, 32, 7168, fp8_dtype), (256, 64, 7168, fp8_dtype), @@ -38,14 +44,159 @@ ] +def as_uint8(x) -> torch.Tensor: + return ( + torch.empty(x.shape, dtype=x.dtype, device=x.device).copy_(x).view(torch.uint8) + ) + + +def silu(x: torch.Tensor) -> torch.Tensor: + one_f32 = torch.tensor([1.0], device=x.device, dtype=torch.float32) + x_f32 = x.to(torch.float32) + act_f32 = x_f32 / (one_f32 + 
torch.exp(-x_f32)) + assert act_f32.dtype == torch.float32 + return act_f32.to(torch.bfloat16) + + +def do_quant(x: torch.Tensor, group_size: int, ceil_ue8m0: bool): + eps_bf16 = torch.tensor([1e-10], device=x.device, dtype=torch.bfloat16) + one_bf16 = torch.tensor([1.0], device=x.device, dtype=torch.bfloat16) + fp8_max_bf16 = torch.tensor( + [torch.finfo(fp8_dtype).max], device=x.device, dtype=torch.bfloat16 + ) + fp8_min_bf16 = torch.tensor( + [torch.finfo(fp8_dtype).min], device=x.device, dtype=torch.bfloat16 + ) + fp8_max_inv = one_bf16 / fp8_max_bf16 + assert fp8_max_inv.dtype == torch.bfloat16 + + assert x.size(-1) % group_size == 0 + num_groups = x.numel() // group_size + x_og_shape = x.shape + + x = x.to(torch.bfloat16) + x = x.view((-1, group_size)) + amax = x.abs().amax(dim=1).clamp(min=eps_bf16) + assert amax.dtype == torch.bfloat16 + s = amax * fp8_max_inv + + if ceil_ue8m0: + s = torch.exp2( + torch.ceil(torch.log2(s).to(torch.bfloat16)).to(torch.bfloat16) + ).to(torch.bfloat16) + + inv_s = one_bf16 / s + inv_s = inv_s.view((num_groups, 1)) + xq = torch.clamp(x * inv_s, min=fp8_min_bf16.item(), max=fp8_max_bf16.item()).to( + fp8_dtype + ) + + xq = xq.view(x_og_shape) + xs = s.view((-1, xq.size(-1) // group_size)) + return xq, xs + + +def silu_mul_quant( + gate: torch.Tensor, up: torch.Tensor, group_size: int, ceil_ue8m0: bool +) -> tuple[torch.Tensor, torch.Tensor]: + assert gate.size(-1) % group_size == 0 + assert up.size(-1) % group_size == 0 + + assert gate.dtype == torch.bfloat16 + assert up.dtype == torch.bfloat16 + + act_bf16 = silu(gate) + assert act_bf16.dtype == torch.bfloat16 + + # act & mul + a_m = act_bf16 * up + assert a_m.dtype == torch.bfloat16 + + q, s = do_quant(a_m, group_size, ceil_ue8m0) + return q, s + + +def pack_scales(x: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor: + """ + pack float32 scales into a int32 tensor + """ + assert x.dtype == torch.float32 + E, T, G = x.size() + + # Add i32_padding here so we can view it as a i32 tensor later on. + i32_padding = round_up(G, 4) - G + ref_s_i8 = torch.empty((E, T, G + i32_padding), dtype=torch.uint8, device="cuda") + for e in range(E): + nt = tokens_per_expert[e].item() + ref_s_i8[e, :nt, :G] = x[e, :nt].view(torch.int32) >> 23 + + ref_s_i32 = ref_s_i8.view(torch.int32) + + return ref_s_i32 + + +def ref_with_scale_fmt( + E: int, + T: int, + H: int, + group_size: int, + tokens_per_expert: torch.Tensor, + gate: torch.Tensor, + up: torch.Tensor, + scale_fmt: DeepGemmQuantScaleFMT, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + The precision types of the operations triggered by this function + match closely with the kernel implementation so we compare more + accurately. 
+ """ + scale_dtype = ( + torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32 + ) + ceil_ue8m0 = scale_fmt in [ + DeepGemmQuantScaleFMT.UE8M0, + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + ] + + ref_q = torch.empty((E, T, H), dtype=fp8_dtype, device="cuda") + ref_s_f32 = torch.empty( + (E, T, cdiv(H, group_size)), dtype=torch.float32, device="cuda" + ) + + for e in range(E): + nt = tokens_per_expert[e].item() + if nt == 0: + continue + ref_q[e, :nt], ref_s_f32[e, :nt] = silu_mul_quant( + gate[e, :nt], up[e, :nt], group_size, ceil_ue8m0=ceil_ue8m0 + ) + + if scale_dtype == torch.float32: + return ref_q, ref_s_f32 + + assert scale_dtype == torch.int32 + return ref_q, pack_scales(ref_s_f32, tokens_per_expert) + + +def token_random(E, T, H2, tokens_per_expert): + """ + Initialize each token in a random range so we test a range of + scale values. + """ + y = torch.empty((E, T, H2), dtype=torch.bfloat16, device="cuda") + for e in range(E): + for t in range(tokens_per_expert[e].item()): + exp = random.choice(range(1, 20)) + y[e, t].uniform_(-(2**exp), 2**exp) + return y + + @pytest.mark.parametrize("E,T,H,fp8_type", CASES) @torch.inference_mode() -def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type): +def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype): group_size = 128 current_platform.seed_everything(42) - # Input tensor of shape (E, T, 2*H) - y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda") tokens_per_expert = torch.randint( low=0, high=T, @@ -54,71 +205,83 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type): device="cuda", ) - # Run the SiLU V2 kernel - # TODO (varun): use_e8m0 is set to false as the reference impl does - # not handle that case. - y_q, y_s = persistent_masked_m_silu_mul_quant( - y, tokens_per_expert, group_size=group_size, use_ue8m0=False - ) + # Input tensor of shape (E, T, 2*H) + y = token_random(E, T, 2 * H, tokens_per_expert) - torch.cuda.synchronize() - fp8_info = torch.finfo(fp8_dtype) - fp8_max = fp8_info.max - fp8_min = fp8_info.min - eps = 1e-10 + gate = y[..., :H].to(torch.bfloat16) + up = y[..., H:].to(torch.bfloat16) - y1 = y[..., :H].float() - y2 = y[..., H:] - silu_x = y1 * torch.sigmoid(y1) - merged = silu_x * y2 + scale_fmts = [ + DeepGemmQuantScaleFMT.FLOAT32, + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + DeepGemmQuantScaleFMT.UE8M0, + ] - for e in range(E): - nt = tokens_per_expert[e].item() - ref_s = torch.empty( - (T, cdiv(H, group_size)), dtype=torch.float32, device="cuda" + # Run the SiLU V2 kernel + for scale_fmt in scale_fmts: + y_q, y_s = persistent_masked_m_silu_mul_quant( + y, + tokens_per_expert, + group_size=group_size, + quant_scale_fmt=scale_fmt, ) - ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda") - for t in range(nt): - data = merged[e, t].float() - ref_q_row = torch.empty_like(data) + ref_y_q, ref_y_s = ref_with_scale_fmt( + E, T, H, group_size, tokens_per_expert, gate, up, scale_fmt=scale_fmt + ) - # process full groups - n_full_groups = H // group_size - if n_full_groups > 0: - data_grp = data[: n_full_groups * group_size].view( - n_full_groups, group_size - ) - amax = data_grp.abs().amax(dim=1).clamp(min=eps) - scale = amax / fp8_max - scaled = data[: n_full_groups * group_size] / scale.repeat_interleave( - group_size - ) - ref_q_row[: n_full_groups * group_size] = scaled.clamp( - fp8_min, fp8_max - ).to(fp8_dtype) - ref_s[t, :n_full_groups] = scale - - # process remainder group - rem = H % group_size - if rem > 0: - data_rem = data[-rem:] - 
amax = data_rem.abs().amax().clamp(min=eps) - scale = amax / fp8_max - scaled = data_rem / scale - ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype) - ref_s[t, -1] = scale - - ref_q[t] = ref_q_row - - y_se = y_s[e].float() - y_qe = y_q[e].float() - - torch.testing.assert_close( - y_qe[:nt].to(torch.float32), - ref_q[:nt].to(torch.float32), - atol=2, - rtol=2e-1, + # deepgemm scales transform + dg_scales = None + if ( + has_deep_gemm() + and current_platform.has_device_capability(100) + and scale_fmt == DeepGemmQuantScaleFMT.UE8M0 + ): + from deep_gemm import transform_sf_into_required_layout + + _q, _s = ref_with_scale_fmt( + E, + T, + H, + group_size, + tokens_per_expert, + gate, + up, + scale_fmt=DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + ) + dg_scales = transform_sf_into_required_layout( + sf=_s, + mn=_q.size(1), + k=_q.size(2), + recipe=(1, 128, 128), + num_groups=_q.size(0), + is_sfa=True, + ) + + expected_scale_dtype = ( + torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32 ) + assert y_s.dtype == expected_scale_dtype + assert ref_y_s.dtype == expected_scale_dtype - torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2) + for e in range(E): + nt = tokens_per_expert[e].item() + + torch.testing.assert_close( + y_q[e, :nt].to(torch.float32), + ref_y_q[e, :nt].to(torch.float32), + ) + + if scale_fmt == DeepGemmQuantScaleFMT.UE8M0: + G = H // group_size + y_s_sliced = as_uint8(y_s[e]) + ref_s_sliced = as_uint8(ref_y_s[e]) + torch.testing.assert_close(y_s_sliced[:nt, :G], ref_s_sliced[:nt, :G]) + if dg_scales is not None: + dg_sliced = as_uint8(dg_scales[e]) + torch.testing.assert_close(y_s_sliced[:nt, :G], dg_sliced[:nt, :G]) + else: + torch.testing.assert_close( + y_s[e, :nt], + ref_y_s[e, :nt], + ) diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 869082f8231d..79c92eb48612 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk @@ -13,14 +14,33 @@ from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( + DeepGemmQuantScaleFMT, fp8_m_grouped_gemm_nt_masked, get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, ) +from vllm.utils.math_utils import cdiv logger = init_logger(__name__) +def scales_shape_stride_dtype( + E: int, T: int, G: int, quant_scale_fmt: DeepGemmQuantScaleFMT +) -> tuple[tuple[int, ...], tuple[int, ...], torch.dtype]: + shape = (E, T, G) + strides = (T * G, 1, T) + if quant_scale_fmt in [ + DeepGemmQuantScaleFMT.FLOAT32, + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + ]: + return shape, strides, torch.float32 + + assert quant_scale_fmt == DeepGemmQuantScaleFMT.UE8M0 + shape = (E, T, cdiv(G, 4)) + strides = (T * cdiv(G, 4), 1, T) + return shape, strides, torch.int32 + + @triton.jit def _silu_mul_fp8_quant_deep_gemm( # Pointers ------------------------------------------------------------ @@ -49,7 +69,7 @@ def _silu_mul_fp8_quant_deep_gemm( eps: tl.constexpr, fp8_min: tl.constexpr, fp8_max: tl.constexpr, - use_ue8m0: tl.constexpr, + ceil_ue8m0: tl.constexpr, # Meta --------------------------------------------------------------- BLOCK: tl.constexpr, NUM_STAGES: 
tl.constexpr, @@ -86,7 +106,7 @@ def _silu_mul_fp8_quant_deep_gemm( y = gate * up y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max - if use_ue8m0: + if ceil_ue8m0: y_s = tl.exp2(tl.ceil(tl.log2(y_s))) y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) @@ -100,7 +120,7 @@ def persistent_masked_m_silu_mul_quant( tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert num_parallel_tokens=16, group_size: int = 128, - use_ue8m0: bool | None = None, + quant_scale_fmt: DeepGemmQuantScaleFMT = DeepGemmQuantScaleFMT.FLOAT32, ) -> tuple[torch.Tensor, torch.Tensor]: """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales y has shape (E, T, 2*H). The first half of the last dimension is @@ -137,7 +157,13 @@ def persistent_masked_m_silu_mul_quant( Returns `(y_q, y_s)` where * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] - * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + * `y_s` depends on quant_scale_fmt, + - quant_scale_fmt == FLOAT32, + `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + - quant_scale_fmt == E8M0, + `y_s`: Int32 tensor, shape (E, T, H // group_size // 4), strides (T*G, 1, T) + - quant_scale_fmt == E8M0_FLOAT32_SPARSE + `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) Let NUM_WARPS be the number of warps in a single thread block and `GROUP_SIZE = 128` be the size of the quantization group. """ @@ -155,17 +181,18 @@ def persistent_masked_m_silu_mul_quant( fp8_dtype = torch.float8_e4m3fn y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) - stride_ys_e = T * G - stride_ys_t = 1 - stride_ys_g = T + ys_shape, ys_strides, ys_dtype = scales_shape_stride_dtype(E, T, G, quant_scale_fmt) y_s = torch.empty_strided( - (E, T, G), - (stride_ys_e, stride_ys_t, stride_ys_g), - dtype=torch.float32, + ys_shape, + ys_strides, + dtype=ys_dtype, device=y.device, ) - use_ue8m0 = use_ue8m0 if use_ue8m0 is not None else is_deep_gemm_e8m0_used() + ceil_ue8m0 = quant_scale_fmt in [ + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + DeepGemmQuantScaleFMT.UE8M0, + ] cuda_arch = current_platform.get_device_capability( device_id=y.device.index @@ -173,7 +200,7 @@ def persistent_masked_m_silu_mul_quant( if cuda_arch >= 80: torch.ops._C.persistent_masked_m_silu_mul_quant( - y, tokens_per_expert, y_q, y_s, use_ue8m0 + y, tokens_per_expert, y_q, y_s, ceil_ue8m0 ) else: stride_cnt_e = tokens_per_expert.stride()[0] @@ -189,6 +216,10 @@ def persistent_masked_m_silu_mul_quant( fp8_max = f_info.max fp8_min = f_info.min eps: float = 1e-10 + assert y_s.dtype == torch.float32, ( + "_silu_mul_fp8_quant_deep_gemm does" + "not support {y_s.dtype} scales. Only torch.float32 supported." + ) _silu_mul_fp8_quant_deep_gemm[grid]( y, y_q, @@ -202,14 +233,14 @@ def persistent_masked_m_silu_mul_quant( stride_yq_e, stride_yq_t, stride_yq_h, - stride_ys_e, - stride_ys_t, - stride_ys_g, + ys_strides[0], + ys_strides[1], + ys_strides[2], stride_cnt_e, eps, fp8_min, fp8_max, - is_deep_gemm_e8m0_used(), + ceil_ue8m0, BLOCK=group_size, NUM_STAGES=4, num_warps=1, @@ -255,7 +286,7 @@ def supports_packed_ue8m0_act_scales(self) -> bool: """ DeepGemm supports packed ue8m0 activation scales format in devices == sm100 """ - return current_platform.is_device_capability(100) + return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100) def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. 
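To make the bookkeeping in scales_shape_stride_dtype concrete, here is a small self-contained sketch using the (8, 64, 7168) case from the tests above; cdiv below is a local stand-in for vllm.utils.math_utils.cdiv so the snippet runs on its own:

    def cdiv(a: int, b: int) -> int:
        # Stand-in for vllm.utils.math_utils.cdiv (ceiling division).
        return -(-a // b)

    E, T, H, group_size = 8, 64, 7168, 128
    G = H // group_size  # 56 groups per token

    # FLOAT32 / FLOAT32_CEIL_UE8M0: one fp32 scale per group, with the token
    # dimension contiguous inside each group.
    assert ((E, T, G), (T * G, 1, T)) == ((8, 64, 56), (3584, 1, 64))

    # UE8M0: four one-byte scales packed into each int32 element.
    assert ((E, T, cdiv(G, 4)), (T * cdiv(G, 4), 1, T)) == ((8, 64, 14), (896, 1, 64))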
@@ -329,10 +360,17 @@ def apply( expected_m, ) + quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle() a2q, a2q_scale = persistent_masked_m_silu_mul_quant( - workspace1, expert_num_tokens + workspace1, + expert_num_tokens, + quant_scale_fmt=quant_scale_fmt, ) fp8_m_grouped_gemm_nt_masked( - (a2q, a2q_scale), (w2, self.w2_scale), output, expert_num_tokens, expected_m + (a2q, a2q_scale), + (w2, self.w2_scale), + output, + expert_num_tokens, + expected_m, ) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 4c15baf7a8f9..b5ab37534dd7 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -9,6 +9,7 @@ import importlib import os from collections.abc import Callable +from enum import Enum from typing import Any, NoReturn import torch @@ -20,6 +21,28 @@ from vllm.utils.math_utils import cdiv +class DeepGemmQuantScaleFMT(Enum): + # Float32 scales in Float32 tensor + FLOAT32 = 0 + # Compute float32 scales and ceil the scales to UE8M0. + # Keep the scales in Float32 tensor. + FLOAT32_CEIL_UE8M0 = 1 + # Compute float32 scales and ceil the scales to UE8M0. + # Pack the scales into a int32 tensor where each int32 + # element contains 4 scale values. + UE8M0 = 2 + + @staticmethod + def from_oracle() -> "DeepGemmQuantScaleFMT": + if not is_deep_gemm_e8m0_used(): + return DeepGemmQuantScaleFMT.FLOAT32 + return ( + DeepGemmQuantScaleFMT.UE8M0 + if current_platform.is_device_capability(100) + else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 + ) + + @functools.cache def is_deep_gemm_supported() -> bool: """Return `True` if DeepGEMM is supported on the current platform. From 119c4927b33f78cb8bb2283a57ee0e3a14021777 Mon Sep 17 00:00:00 2001 From: Yannick Schnider Date: Thu, 13 Nov 2025 19:18:47 +0100 Subject: [PATCH 021/578] [Bugfix] Fix validate model input for decoder models (#27099) Signed-off-by: Yannick Schnider Signed-off-by: Yannick Schnider Signed-off-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- tests/v1/e2e/test_context_length.py | 63 +++++++++++++++++++++++++++++ vllm/v1/engine/processor.py | 15 +++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/v1/e2e/test_context_length.py diff --git a/tests/v1/e2e/test_context_length.py b/tests/v1/e2e/test_context_length.py new file mode 100644 index 000000000000..0ac40bec35fe --- /dev/null +++ b/tests/v1/e2e/test_context_length.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for vLLM `vllm/v1/engine/processor.Processor._validate_model_input()` +handling of maximum context length for decoder models. + +This test ensures: +- A prompt that is one token shorter than the model's maximum context length + can be processed successfully when requesting one additional token. +- A prompt that reaches the model's maximum context length throws a + `ValueError` when requesting at least one additional token. 
+""" + +import pytest + +from tests.conftest import VllmRunner +from tests.utils import create_new_process_for_each_test + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("model, max_model_len", [("JackFram/llama-160m", 2048)]) +@pytest.mark.parametrize( + "prompt_len, max_tokens", + [ + (2047, 1), # prompt_len = max_model_len - 1 -> allowed + (2048, 1), # prompt_len = max_model_len -> not allowed + ], +) +def test_decoder_max_context_length_validation( + model: str, + max_model_len: int, + vllm_runner: type[VllmRunner], + prompt_len: int, + max_tokens: int, +) -> None: + """Check vLLM decoder model input validation for edge cases where + the prompt length is (almost) equal to the max model length.""" + + prompt_ids = [[43] * prompt_len] + + with vllm_runner( + model_name=model, + tokenizer_name=model, + max_model_len=max_model_len, + max_num_seqs=1, + tensor_parallel_size=1, + ) as vllm_model: + if prompt_len + max_tokens <= max_model_len: + # Should succeed as constraints are met + vllm_model.generate_greedy(prompt_ids, max_tokens) + else: + # Should raise the ValueError defined in + # vllm/v1/engine/processor.Processor_validate_model_input() + expected_msg = ( + f"The decoder prompt (length {prompt_len}) plus the number of " + f"requested output tokens (at least 1) is longer than " + f"the maximum model length of {max_model_len}. " + "Make sure that `max_model_len` is no smaller than the number of " + "text tokens (prompt + requested output tokens)." + ) + with pytest.raises(ValueError) as excinfo: + vllm_model.generate_greedy(prompt_ids, max_tokens) + assert expected_msg in str(excinfo.value) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index f2d992403e1a..69509d5d4712 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -575,6 +575,21 @@ def _validate_model_input( # check that chunked prefill does not truncate them # max_batch_len = self.scheduler_config.max_num_batched_tokens + if ( + prompt_len == max_prompt_len + and prompt_type == "decoder" + and not model_config.is_multimodal_model + ): + suggestion = ( + "Make sure that `max_model_len` is no smaller than the " + "number of text tokens (prompt + requested output tokens)." + ) + raise ValueError( + f"The {prompt_type} prompt (length {prompt_len}) plus the number of " + f"requested output tokens (at least 1) is longer than the maximum " + f"model length of {max_prompt_len}. 
{suggestion}" + ) + def stat_mm_cache(self) -> MultiModalCacheStats | None: return self.input_preprocessor.stat_mm_cache() From f9f3b596f374c4a01acef275ee1f35398bb05164 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 13 Nov 2025 12:20:01 -0600 Subject: [PATCH 022/578] [Attention][Bugfix] Fix FA sink support (#28660) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/flash_attn.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index bfb4a45c2b56..81623549ae85 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -130,6 +130,12 @@ def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool: return flash_attn_supports_fp8() return kv_cache_dtype in ["auto"] + @classmethod + def supports_sink(cls) -> bool: + if not is_flash_attn_varlen_func_available(): + return False + return flash_attn_supports_sinks() + @classmethod def supports_compute_capability(cls, capability: DeviceCapability) -> bool: return capability >= DeviceCapability(8, 0) From 5d6ce2b9601f3251487e44eb9e00c098101c4af6 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Fri, 14 Nov 2025 02:21:25 +0800 Subject: [PATCH 023/578] [Perf] Support stream interval for reducing host overhead (#27869) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: Nick Hill --- tests/v1/engine/test_output_processor.py | 18 ++++++++++-- vllm/config/scheduler.py | 6 ++++ vllm/engine/arg_utils.py | 6 ++++ vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/llm_engine.py | 3 +- vllm/v1/engine/output_processor.py | 36 +++++++++++++++++++++++- 6 files changed, 67 insertions(+), 5 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index d77a119ec60f..8e1198b315bd 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -49,10 +49,15 @@ def _ref_convert_id_to_token( @pytest.mark.parametrize( "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] ) +@pytest.mark.parametrize("stream_interval", [1, 5, 10]) def test_incremental_detokenization( - request_output_kind: RequestOutputKind, dummy_test_vectors + request_output_kind: RequestOutputKind, + stream_interval: int, + dummy_test_vectors, ): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) + output_processor = OutputProcessor( + dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval + ) engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens) # Make N requests. @@ -104,9 +109,18 @@ def test_incremental_detokenization( if request_id not in gen_strings: gen_strings[request_id] = new_text gen_tokens[request_id] = new_tokens + if request_output_kind == RequestOutputKind.DELTA: + assert len(new_tokens) == 1, f"{len(new_tokens)=}" else: gen_strings[request_id] += new_text gen_tokens[request_id].extend(new_tokens) + if ( + request_output_kind == RequestOutputKind.DELTA + and not request_output.finished + ): + assert len(new_tokens) >= stream_interval, ( + f"{len(new_tokens)=}, {stream_interval=}" + ) # Confirmed tracked values matches what we expected. 
for idx, (ref_gen_str, ref_gen_toks) in enumerate( diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 47aa343527b3..71a06e167fd9 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -142,6 +142,12 @@ class SchedulerConfig: speculative decoding and pipeline parallelism. """ + stream_interval: int = Field(default=1, ge=1) + """The interval (or buffer size) for streaming in terms of token length. + A smaller value (1) makes streaming smoother by sending each token immediately, + while a larger value (e.g., 10) reduces host overhead and may increase throughput + by batching multiple tokens before sending.""" + def get_scheduler_cls(self) -> type["SchedulerInterface"]: if self.scheduler_cls is None: if self.async_scheduling: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca7f5e5e3e05..b025004ea022 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -558,6 +558,8 @@ class EngineArgs: async_scheduling: bool | None = SchedulerConfig.async_scheduling + stream_interval: int = SchedulerConfig.stream_interval + kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill kv_offloading_size: float | None = CacheConfig.kv_offloading_size @@ -1067,6 +1069,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: scheduler_group.add_argument( "--async-scheduling", **scheduler_kwargs["async_scheduling"] ) + scheduler_group.add_argument( + "--stream-interval", **scheduler_kwargs["stream_interval"] + ) # Compilation arguments compilation_kwargs = get_kwargs(CompilationConfig) @@ -1562,6 +1567,7 @@ def create_engine_config( long_prefill_token_threshold=self.long_prefill_token_threshold, disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager, async_scheduling=self.async_scheduling, + stream_interval=self.stream_interval, ) if not model_config.is_multimodal_model and self.default_mm_loras: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index aee21fb3fffe..48ea6ef8515c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -120,8 +120,9 @@ def __init__( ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). + stream_interval = self.vllm_config.scheduler_config.stream_interval self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats + self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval ) endpoint = self.observability_config.otlp_traces_endpoint if endpoint is not None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 6224af5700b7..1db83446ba0b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -96,8 +96,9 @@ def __init__( ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
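Since the new stream_interval knob flows from EngineArgs through SchedulerConfig into the OutputProcessor, a hedged usage sketch (assuming this patch is applied; the model name is only an example):

    from vllm.engine.arg_utils import AsyncEngineArgs

    # stream_interval=1 keeps per-token streaming; larger values buffer tokens
    # before each RequestOutput is emitted, trading smoothness for lower host
    # overhead. The first token and finished requests are still sent promptly.
    args = AsyncEngineArgs(model="facebook/opt-125m", stream_interval=8)
    assert args.stream_interval == 8
    # create_engine_config() copies this into SchedulerConfig.stream_interval,
    # which AsyncLLM then forwards to its OutputProcessor.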
+ stream_interval = self.vllm_config.scheduler_config.stream_interval self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats + self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval ) endpoint = self.observability_config.otlp_traces_endpoint if endpoint is not None: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index d8d03f19d466..bdbbfe2595f8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -104,6 +104,7 @@ def __init__( arrival_time: float, queue: RequestOutputCollector | None, log_stats: bool, + stream_interval: int, top_p: float | None = None, n: int | None = None, temperature: float | None = None, @@ -131,6 +132,10 @@ def __init__( self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None + # Stream Interval + self.stream_interval = stream_interval + self.sent_tokens_offset = 0 # Offset of sent tokens + @classmethod def from_new_request( cls, @@ -141,6 +146,7 @@ def from_new_request( request_index: int, queue: RequestOutputCollector | None, log_stats: bool, + stream_interval: int, ) -> "RequestState": if sampling_params := request.sampling_params: if not sampling_params.detokenize: @@ -188,6 +194,7 @@ def from_new_request( arrival_time=request.arrival_time, queue=queue, log_stats=log_stats, + stream_interval=stream_interval, ) def make_request_output( @@ -205,6 +212,29 @@ def make_request_output( # Only the final output is required in FINAL_ONLY mode. return None + if self.stream_interval > 1: + assert self.detokenizer is not None + + # Send output request only when + # 1. It has finished, or + # 2. It is the first token, or + # 3. It has reached the stream interval number of tokens + if not ( + finished + or self.sent_tokens_offset == 0 + or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset + >= self.stream_interval + ): + return None + + if self.output_kind == RequestOutputKind.DELTA: + # Send tokens from the offset in DELTA mode, otherwise all + # tokens are sent. 
+ new_token_ids = self.detokenizer.output_token_ids[ + self.sent_tokens_offset : + ] + self.sent_tokens_offset = len(self.detokenizer.output_token_ids) + request_id = self.request_id if pooling_output is not None: return self._new_request_output( @@ -310,9 +340,12 @@ def _new_pooling_output( class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" - def __init__(self, tokenizer: AnyTokenizer, log_stats: bool): + def __init__( + self, tokenizer: AnyTokenizer, log_stats: bool, stream_interval: int = 1 + ): self.log_stats = log_stats self.tokenizer = tokenizer + self.stream_interval = stream_interval self.request_states: dict[str, RequestState] = {} self.parent_requests: dict[str, ParentRequest] = {} self.lora_states = LoRARequestStates(log_stats) @@ -385,6 +418,7 @@ def add_request( request_index=request_index, queue=queue, log_stats=self.log_stats, + stream_interval=self.stream_interval, ) self.request_states[request_id] = req_state if parent_req: From 968060c15adc0b68a76d37db00acf1273a23b829 Mon Sep 17 00:00:00 2001 From: Qiu Date: Fri, 14 Nov 2025 03:29:22 +0800 Subject: [PATCH 024/578] [bugfix] correct local_chunk_len for DCP in reorg_kvcache with long context (#28526) Signed-off-by: QiuChunshuo Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/v1/attention/backends/mla/common.py | 29 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 467c01cd9d06..2ccdd1f143ce 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -337,6 +337,7 @@ class ChunkedContextMetadata: local_context_lens_allranks: list[list[int]] | None = None padded_local_cu_seq_lens: torch.Tensor | None = None cu_seq_lens_lst: list[list[int]] | None = None + chunk_size: int | None = None block_table: torch.Tensor query_start_loc: torch.Tensor @@ -902,6 +903,7 @@ def build( device, non_blocking=True ), cu_seq_lens_lst=cu_seq_lens_cpu.tolist(), + chunk_size=padded_local_max_context_chunk_across_ranks, ) else: chunked_context_metadata = chunked_context_metadata_cls( @@ -986,6 +988,8 @@ def reorg_kvcache( local_context_lens_allranks: list[list[int]], sum_seq_len: int, max_seq_len: int, + chunk_size: int, + chunk_idx: int, toks: int, ) -> tuple[torch.Tensor, torch.Tensor]: """ @@ -1001,6 +1005,9 @@ def reorg_kvcache( local_context_lens_allranks: local context lengths on each CP rank. sum_seq_len: the sum of cp_chunk_seq_lens_lst. max_seq_len: the max value of cp_chunk_seq_lens_lst. + chunk_size: the local padded max context chunk from + chunked_context_metadata building. + chunk_idx: chunk idx of chunked_prefill. toks: the number of tokens for local gather cache. """ kv_c_segments = [] @@ -1012,20 +1019,31 @@ def reorg_kvcache( ): cur_seq_len = 0 for rank, local_context_len in enumerate(local_context_lens): - if local_context_len != 0: + # Note(qcs): We split the context into multiple chunks, + # depending on the size of the workspace. + # local_context in dcp0: |-----------------| + # local_context in dcp1: |--------------| + # n*padded_local_chunk: |-----|-----|-----| + # local_chunk_len in dcp1: |-----|-----|--| + # so we need update the last chunk length in dcp1. 
+ local_chunk_len = min( + max(0, local_context_len - chunk_idx * chunk_size), + padded_local_chunk_seq_len, + ) + if local_chunk_len != 0: kv_c_segment = allgatered_kv_c_normed[ rank * toks + src_token_idx : rank * toks + src_token_idx - + local_context_len + + local_chunk_len ] k_pe_segment = allgatered_k_pe[ rank * toks + src_token_idx : rank * toks + src_token_idx - + local_context_len + + local_chunk_len ] kv_c_segments.append(kv_c_segment) k_pe_segments.append(k_pe_segment) - cur_seq_len += local_context_len + cur_seq_len += local_chunk_len max_seq_len_check = max(max_seq_len_check, cur_seq_len) src_token_idx += padded_local_chunk_seq_len reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0) @@ -1676,6 +1694,7 @@ def _context_parallel_compute_prefill_context( assert prefill_metadata.chunked_context.local_context_lens_allranks is not None assert prefill_metadata.chunked_context.padded_local_cu_seq_lens is not None assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None + assert prefill_metadata.chunked_context.chunk_size is not None output = None iters = len(prefill_metadata.chunked_context.seq_tot) @@ -1725,6 +1744,8 @@ def _context_parallel_compute_prefill_context( local_context_lens_allranks=prefill_metadata.chunked_context.local_context_lens_allranks, sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i][-1], max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i], + chunk_size=prefill_metadata.chunked_context.chunk_size, + chunk_idx=i, toks=toks, ) From 262d263f6c56fa95e15422d3a475da8efdf67cc1 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Thu, 13 Nov 2025 12:09:05 -0800 Subject: [PATCH 025/578] [Bugfix] Eliminate tuple inputs to submodules in graph partitioning (#28533) Signed-off-by: Yanan Cao --- .buildkite/test-pipeline.yaml | 1 + tests/compile/test_graph_partition.py | 124 ++++++++++++++++++++++++++ vllm/compilation/backends.py | 17 +++- 3 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 tests/compile/test_graph_partition.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index be1b79ddc432..52539728215b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -445,6 +445,7 @@ steps: - vllm/ - tests/compile commands: + - pytest -v -s compile/test_graph_partition.py - pytest -v -s compile/test_config.py - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py diff --git a/tests/compile/test_graph_partition.py b/tests/compile/test_graph_partition.py new file mode 100644 index 000000000000..1cd783843a62 --- /dev/null +++ b/tests/compile/test_graph_partition.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import operator + +import pytest +import torch +from torch.fx.experimental.proxy_tensor import make_fx + +from vllm.compilation.backends import split_graph + + +def test_getitem_moved_to_producer_subgraph(): + """ + Test that getitem operations are moved to the same subgraph as their input, + preventing tuple inputs to submodules. 
+ """ + + def model_fn(x: torch.Tensor) -> torch.Tensor: + # torch.split returns a tuple, creating real getitem operations + # Should become first submodule that produces tuple + chunks = torch.split(x, x.shape[0] // 2, dim=0) + + # Following ops should become second submodule that consumes tuple + result_0 = torch.relu(chunks[0]) + result_1 = torch.relu(chunks[1]) + return torch.cat([result_0, result_1], dim=0) + + x = torch.randn(4, 3) + gm = make_fx(model_fn)(x) + + has_getitem = any( + node.op == "call_function" and node.target == operator.getitem + for node in gm.graph.nodes + ) + assert has_getitem, "Test setup failed: graph should contain getitem operations" + + # Split on tuple producer aten::split + split_ops = ["aten::split.Tensor"] + split_gm, split_items = split_graph(gm, split_ops) + assert len(split_items) == 2, "Graph should be split into 2 submodules" + + for split_item in split_items: + submodule = split_item.graph + + getitem_on_placeholder = [] + for node in submodule.graph.nodes: + if ( + node.op == "call_function" + and node.target == operator.getitem + and node.args[0].op == "placeholder" + ): + getitem_on_placeholder.append(node) + + assert len(getitem_on_placeholder) == 0, ( + f"Submodule {split_item.submod_name} has getitem operations on " + f"placeholder nodes: {[n.name for n in getitem_on_placeholder]}. " + "This means tuple inputs were not properly eliminated." + ) + + new_x = torch.randn(4, 3) + output_original = gm(new_x) + output_split = split_gm(new_x) + + assert torch.allclose(output_original, output_split), "Output mismatch" + + +def test_no_tuple_inputs_with_multiple_consumers(): + """ + Test that when a tuple is consumed by multiple split operations, + getitem operations are properly moved to avoid tuple inputs. + """ + + def model_fn(x: torch.Tensor) -> torch.Tensor: + # torch.split returns a tuple, creating real getitem operations + # Should become first submodule that produces tuple + chunks = torch.split(x, x.shape[0] // 2, dim=0) + + # These should become second submodule consuming tuple + result_1 = torch.relu(chunks[0]) + result_2 = torch.relu(chunks[1]) + + # Artificial graph splitting point to create another + # independent submodule that consumes tuple later + # This would become the third submodule + result_1 = torch.sigmoid(result_1) + + # Fourth submodule that consumes tuple + result = torch.cat([chunks[0], chunks[1], result_1, result_2]) + return result + + x = torch.randn(4, 3) + gm = make_fx(model_fn)(x) + + has_getitem = any( + node.op == "call_function" and node.target == operator.getitem + for node in gm.graph.nodes + ) + assert has_getitem, "Test setup failed: graph should contain getitem operations" + + split_ops = ["aten::split.Tensor", "aten::sigmoid"] + split_gm, split_items = split_graph(gm, split_ops) + assert len(split_items) == 4, "Graph should be split into 4 submodules" + + for split_item in split_items: + submodule = split_item.graph + + for node in submodule.graph.nodes: + if ( + node.op == "call_function" + and node.target == operator.getitem + and node.args[0].op == "placeholder" + ): + pytest.fail( + f"Submodule {split_item.submod_name} has getitem on " + f"placeholder {node.args[0].name}, indicating it receives " + "a tuple input" + ) + + new_x = torch.randn(4, 3) + output_original = gm(new_x) + output_split = split_gm(new_x) + + assert torch.allclose(output_original, output_split), "Output mismatch after split" diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index be69075f94f0..60ef6eef2166 
100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -4,6 +4,7 @@ import ast import dataclasses import hashlib +import operator import os import pprint import time @@ -307,12 +308,24 @@ def split_graph( ) -> tuple[fx.GraphModule, list[SplitItem]]: # split graph by ops subgraph_id = 0 - node_to_subgraph_id = {} - split_op_graphs = [] + node_to_subgraph_id: dict[fx.Node, int] = {} + split_op_graphs: list[int] = [] for node in graph.graph.nodes: if node.op in ("output", "placeholder"): continue + # Check if this is a getitem operation on a node from an earlier subgraph. + # If so, assign it to the same subgraph as its input to avoid passing entire + # tuple as input to submodules, which is against standalone_compile and + # AoTAutograd input requirement. + if node.op == "call_function" and node.target == operator.getitem: + # Assign this getitem to the same subgraph as its input + input_node = node.args[0] + if input_node.op != "placeholder": + assert input_node in node_to_subgraph_id + node_to_subgraph_id[node] = node_to_subgraph_id[input_node] + continue + if should_split(node, splitting_ops): subgraph_id += 1 node_to_subgraph_id[node] = subgraph_id From faed7bf07ec831529c5ed54e15b21e30b30dc16e Mon Sep 17 00:00:00 2001 From: Kebe Date: Fri, 14 Nov 2025 05:48:08 +0900 Subject: [PATCH 026/578] [Bugfix] [CPU] bump torch to 2.9.0 for Darwin to fix segmentation fault (#27791) Signed-off-by: Kebe Signed-off-by: Michael Goin Co-authored-by: Michael Goin --- requirements/cpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 605ce73bff9c..d11787df4d92 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -8,7 +8,7 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" -torch==2.8.0; platform_system == "Darwin" +torch==2.9.0; platform_system == "Darwin" torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch From 1b622deba73347f044c13fa80a09a5647d21a45c Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 13 Nov 2025 13:01:43 -0800 Subject: [PATCH 027/578] [Misc] Update CODEOWNERS for simon-mo and comaniac (#28675) Signed-off-by: Simon Mo --- .github/CODEOWNERS | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f26c782bccf2..bfb0e91fd06e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,8 +3,8 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention @LucasWilkinson -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep @@ -20,15 +20,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config @simon-mo 
@WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg -/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 +/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg +/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 # vLLM V1 /vllm/v1/attention @LucasWilkinson /vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/triton_attn.py @tdoublep -/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC /vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett @@ -36,11 +36,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/offloading @ApostaC # Test ownership -/.buildkite/lm-eval-harness @mgoin @simon-mo +/.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche /tests/evals @mgoin /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 @@ -49,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep From e64011f29a63ef9c4fc67bad1fd42af4f3cfad35 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 13 Nov 2025 17:19:35 -0500 Subject: [PATCH 028/578] [CI] Bug: Fix ci entrypoint pooling (#28684) Signed-off-by: yewentao256 --- vllm/v1/engine/processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 69509d5d4712..0404f6ff2771 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -579,6 +579,7 @@ def _validate_model_input( prompt_len == max_prompt_len and prompt_type == "decoder" and not model_config.is_multimodal_model + and self.model_config.runner_type != "pooling" ): suggestion = ( "Make sure that `max_model_len` is no smaller than the " From 6e25b1cddfd78eab307acdb5e3ec14475e465d90 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 13 Nov 2025 23:30:59 +0000 Subject: [PATCH 029/578] [KV Connector] Test async mode in scheduler tests (#28550) Signed-off-by: Mark McLoughlin --- tests/v1/core/test_scheduler.py | 100 ++++++++++++++++++---------- tests/v1/core/utils.py | 24 +++++-- tests/v1/kv_connector/unit/utils.py | 86 
+++++++++++++++++++++++- 3 files changed, 165 insertions(+), 45 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d5b829e79b8f..d31338220fca 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -31,11 +31,11 @@ KVCacheConfig, KVCacheGroupSpec, ) -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from .utils import EOS_TOKEN_ID, create_requests, create_scheduler +from .utils import EOS_TOKEN_ID, create_requests, create_scheduler, mock_kv pytestmark = pytest.mark.cpu_test @@ -888,27 +888,65 @@ def _step_until_done( all_finished = all_done -def test_kv_connector_basic(): +def _step_until_kv_transfer_finished(scheduler: Scheduler, req_ids: list[str]): + """Cycle requests through a KV transfer cyle.""" + + # Requests should first transition to WAITING_FOR_REMOTE_KVS + output = scheduler.schedule() + assert len(scheduler.waiting) == len(req_ids) + assert len(scheduler.running) == 0 + assert len(output.scheduled_new_reqs) == 0 + for req in scheduler.requests.values(): + assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS + + # No model execution yet + EMPTY_OUTPUT = ModelRunnerOutput( + req_ids=[], + req_id_to_index={}, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, EMPTY_OUTPUT) + + # Simulate KV transfer completion using KVConnectorOutput.finished_recving + output = scheduler.schedule() + assert len(scheduler.waiting) == len(req_ids) + assert len(scheduler.running) == 0 + + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=[], + req_id_to_index={}, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + kv_connector_output=KVConnectorOutput(finished_recving=req_ids), + ) + scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + for req_id in req_ids: + assert req_id in scheduler.finished_recving_kv_req_ids + + +@pytest.mark.parametrize("is_async", [False, True]) +def test_kv_connector_basic(is_async: bool): """ Test whether Scheduler with KVConnector schedules tokens, allocates memory, and cleans up requests as expected under normal operation. """ # Setup Scheduler. + BLOCK_SIZE = 16 + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 scheduler = create_scheduler( enable_prefix_caching=True, - use_kv_connector=True, + use_kv_connector=mock_kv( + matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async + ), + block_size=BLOCK_SIZE, ) NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks() - BLOCK_SIZE = scheduler.cache_config.block_size - - # Mock External Cache Hit. 
- NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, - ) ###################################################### # FIRST SET OF REQUESTS - External Hit Only @@ -928,6 +966,9 @@ def test_kv_connector_basic(): req_ids.append(request.request_id) req_to_index[request.request_id] = i + if is_async: + _step_until_kv_transfer_finished(scheduler, req_ids) + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, @@ -978,6 +1019,9 @@ def test_kv_connector_basic(): req_ids.append(request.request_id) req_to_index[request.request_id] = i + if is_async: + _step_until_kv_transfer_finished(scheduler, req_ids) + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, @@ -1020,17 +1064,10 @@ def test_external_prefix_cache_metrics(): """ # Setup Scheduler. + NUM_MATCHED_NEW_TOKENS = 4 scheduler = create_scheduler( enable_prefix_caching=False, - use_kv_connector=True, - ) - - # Mock connector to simulate a partial external cache hit - NUM_MATCHED_NEW_TOKENS = 4 - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, + use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), ) # --- Prepare simple requests --- @@ -1085,21 +1122,16 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): # Setup Scheduler With Mock External Cache Hit. BLOCK_SIZE = 4 NUM_BLOCKS = 10 + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 scheduler = create_scheduler( enable_prefix_caching=True, - use_kv_connector=True, + use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, # encoder connector should not affect test results use_ec_connector=use_ec_connector, ec_role=ec_role, ) - NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, - ) # Create two requests. The second request will not be able to # allocate slots because it will not have enough blocks. @@ -1174,9 +1206,10 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): BLOCK_SIZE = 2 # NOTE: there is 1 null block, so this is 6 blocks. NUM_BLOCKS = 7 + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE scheduler = create_scheduler( enable_prefix_caching=True, - use_kv_connector=True, + use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, # encoder connector should not affect test results @@ -1184,13 +1217,6 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): ec_role=ec_role, ) - NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, - ) - # Create two requests. # Both can be scheduled at first, but the second request # will be preempted and re-scheduled. 
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 3692e633322e..65511c17473b 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -3,6 +3,7 @@ import torch +from tests.v1.kv_connector.unit.utils import MockKVConfig from vllm.config import ( CacheConfig, ECTransferConfig, @@ -33,6 +34,10 @@ EOS_TOKEN_ID = 50256 +def mock_kv(matched_tokens: int, is_async: bool): + return MockKVConfig(matched_tokens=matched_tokens, is_async=is_async) + + def create_scheduler( model: str = "facebook/opt-125m", max_num_seqs: int = 16, @@ -40,7 +45,7 @@ def create_scheduler( enable_prefix_caching: bool | None = None, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, - use_kv_connector: bool = False, + use_kv_connector: None | bool | MockKVConfig = None, num_blocks: int = 10000, block_size: int = 16, max_model_len: int | None = None, @@ -94,15 +99,22 @@ def create_scheduler( cache_dtype="auto", **kwargs_cache, ) - kv_transfer_config = ( - KVTransferConfig( + kv_transfer_config = None + if isinstance(use_kv_connector, MockKVConfig): + kv_transfer_config = KVTransferConfig( + kv_connector="MockKVConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "matched_tokens": use_kv_connector.matched_tokens, + "is_async": use_kv_connector.is_async, + }, + ) + elif use_kv_connector: + kv_transfer_config = KVTransferConfig( kv_connector="SharedStorageConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) - if use_kv_connector - else None - ) speculative_config: SpeculativeConfig | None = None if num_speculative_tokens is not None: diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f0031643aa9d..f35f91bb3adf 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -3,7 +3,8 @@ import tempfile from collections import defaultdict from collections.abc import Callable -from itertools import count +from dataclasses import dataclass +from itertools import chain, count from typing import Any import torch @@ -18,13 +19,18 @@ VllmConfig, ) from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, +) from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa SharedStorageConnector, ) from vllm.utils.hashing import sha256 from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash -from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.core.sched.scheduler import Scheduler, SchedulerOutput from vllm.v1.kv_cache_interface import ( FullAttentionSpec, KVCacheConfig, @@ -307,6 +313,82 @@ def wrapper(*args, **kwargs): return attr +@dataclass(frozen=True) +class MockKVConfig: + matched_tokens: int = 0 + is_async: bool = False + + +class MockKVConnectorMetadata(KVConnectorMetadata): + def __init__(self): + # Scheduler tests check metadata.requests + self.requests: list = [] + + +class MockKVConnector(KVConnectorBase_V1): + """Mock KV connector for scheduler tests, supporting both sync and async mode.""" + + def __init__( + self, + vllm_config: VllmConfig, + role: KVConnectorRole, + kv_cache_config: KVCacheConfig | None = None, + ): + super().__init__(vllm_config, role, kv_cache_config) + extra_config = self._kv_transfer_config.kv_connector_extra_config + self.config = MockKVConfig( + 
matched_tokens=extra_config["matched_tokens"], + is_async=extra_config["is_async"], + ) + + def get_num_new_matched_tokens( + self, + request: Request, + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + return (self.config.matched_tokens, self.config.is_async) + + def update_state_after_alloc( + self, + request: Request, + blocks: KVCacheBlocks, + num_external_tokens: int, + ): + pass + + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> KVConnectorMetadata: + metadata = MockKVConnectorMetadata() + cached_reqs = scheduler_output.scheduled_cached_reqs + for req_id in chain( + (req.req_id for req in scheduler_output.scheduled_new_reqs), + ( + req_id + for req_id in cached_reqs.req_ids + if req_id in cached_reqs.resumed_req_ids + ), + ): + metadata.requests.append({"req_id": req_id}) + return metadata + + def start_load_kv(self, kv_caches, finished_req_ids): + pass + + def wait_for_layer_load(self, layer_name): + pass + + def save_kv_layer(self, layer_name, kv_layer, attn_metadata, **kwargs): + pass + + def wait_for_save(self): + pass + + KVConnectorFactory.register_connector( "TestSharedStorageConnector", __name__, TestSharedStorageConnector.__name__ ) + +KVConnectorFactory.register_connector( + "MockKVConnector", __name__, MockKVConnector.__name__ +) From f2b8e1c5510cf3621dc4b910f0eba5289d9fee88 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:16:34 -0600 Subject: [PATCH 030/578] Mirrored test group definitions for AMD (2025-11-11) (#28573) Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 163 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 153 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5fd048c2ad0c..e232000511c3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -226,6 +226,27 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + #- export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - label: EPLB Algorithm Test # 5min mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -238,11 +259,11 @@ steps: commands: - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 5min +- label: EPLB Execution Test # 10min mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 # grade: Blocking - timeout_in_minutes: 15 + timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -250,6 +271,7 @@ steps: - tests/distributed/test_eplb_execute.py commands: - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 @@ -273,7 +295,7 @@ steps: - label: Regression Test # 7min timeout_in_minutes: 
20 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 grade: Blocking source_file_dependencies: @@ -288,7 +310,7 @@ steps: timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 - #grade: Blocking + # grade: Blocking source_file_dependencies: - vllm/ - tests/engine @@ -337,6 +359,7 @@ steps: - tests/v1 commands: # split the test to avoid interference + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload @@ -344,7 +367,7 @@ steps: - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py @@ -353,6 +376,20 @@ steps: - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine +# TODO: Add the "V1 Test attetion (MI300)" test group + +- label: V1 Test attention (H100) # 10min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -479,10 +516,11 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/test_multimodal_compile.py - pytest -v -s compile/piecewise/ -- label: PyTorch Fullgraph Test # 22min - timeout_in_minutes: 35 +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -491,8 +529,23 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py - - pytest -v -s compile/test_fusions_e2e.py + - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + +- label: Cudagraph test + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 @@ -544,6 +597,8 @@ steps: - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 @@ -562,10 +617,13 @@ steps: - label: Model Executor Test # 23min timeout_in_minutes: 35 + torch_nightly: true mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: + - 
vllm/engine/arg_utils.py + - vllm/config/model.py - vllm/model_executor - tests/model_executor - tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -861,9 +919,10 @@ steps: - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Accuracy Eval (Small Models) # 10min + timeout_in_minutes: 70 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 - timeout_in_minutes: 15 + # grade: Blocking working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - vllm/multimodal/ @@ -934,6 +993,7 @@ steps: - label: Transformers Nightly Models Test mirror_hardwares: [amdexperimental] agent_pool: mi325_1 + # grade: Blocking working_dir: "/vllm-workspace/" optional: true commands: @@ -961,11 +1021,16 @@ steps: - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py @@ -1002,7 +1067,33 @@ steps: - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + +- label: Blackwell Fusion E2E Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusions_e2e.py + - tests/compile/test_full_graph.py + commands: + - nvidia-smi + # Run all e2e fusion tests - pytest -v -s tests/compile/test_fusions_e2e.py + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1253,6 +1344,7 @@ steps: - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] agent_pool: mi325_4 + # grade: Blocking timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -1267,6 +1359,9 @@ steps: ##### A100 test ##### - label: Distributed Tests (A100) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking gpu: a100 optional: true num_gpus: 
4 @@ -1281,6 +1376,9 @@ steps: - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking gpu: a100 optional: true num_gpus: 4 @@ -1292,8 +1390,27 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + ##### H200 test ##### - label: Distributed Tests (H200) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking gpu: h200 optional: true working_dir: "/vllm-workspace/" @@ -1305,6 +1422,7 @@ steps: - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional @@ -1315,6 +1433,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py ##### RL Integration Tests ##### - label: Prime-RL Integration Test # 15min @@ -1330,3 +1449,27 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Accuracy + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 From 4d5943bda63c306275afc1a10edee26da45cd4ef Mon Sep 17 00:00:00 2001 From: Hank_ <37239608+ILikeIneine@users.noreply.github.com> Date: Fri, 14 Nov 2025 09:24:10 +0800 Subject: [PATCH 031/578] [quantization][config] enable override existing quant_config (#28510) Signed-off-by: Hank Co-authored-by: Michael Goin --- .../test_register_quantization_config.py | 12 +++++++++--- .../model_executor/layers/quantization/__init__.py | 14 +++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index aeef4c2fd8a7..8da048703df9 100644 --- a/tests/quantization/test_register_quantization_config.py +++ 
b/tests/quantization/test_register_quantization_config.py @@ -7,6 +7,7 @@ Run `pytest tests/quantization/test_register_quantization_config.py`. """ +import logging from typing import Any import pytest @@ -100,17 +101,22 @@ def get_quant_method( return None -def test_register_quantization_config(): +def test_register_quantization_config(caplog_vllm): """Test register custom quantization config.""" # The quantization method `custom_quant` should be registered. assert get_quantization_config("custom_quant") == CustomQuantConfig # The quantization method `custom_quant` is already exists, - # should raise an error. - with pytest.raises(ValueError): + # should raise a warning when re-registering it. + with caplog_vllm.at_level(logging.WARNING): register_quantization_config("custom_quant")(CustomQuantConfig) + assert any( + "The quantization method 'custom_quant' already exists" in message + for message in caplog_vllm.messages + ), "Expected a warning when re-registering custom_quant" + @pytest.mark.parametrize( argnames="model", diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b92fb8d266b7..bb42b10f8718 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -3,8 +3,11 @@ from typing import Literal, get_args +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +logger = init_logger(__name__) + QuantizationMethods = Literal[ "awq", "deepspeedfp", @@ -70,15 +73,20 @@ def register_quantization_config(quantization: str): def _wrapper(quant_config_cls): if quantization in QUANTIZATION_METHODS: - raise ValueError( - f"The quantization method `{quantization}` is already exists." + logger.warning( + "The quantization method '%s' already exists and will be " + "overwritten by the quantization config %s.", + quantization, + quant_config_cls, ) + else: + QUANTIZATION_METHODS.append(quantization) + if not issubclass(quant_config_cls, QuantizationConfig): raise ValueError( "The quantization config must be a subclass of `QuantizationConfig`." 
) _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls - QUANTIZATION_METHODS.append(quantization) return quant_config_cls return _wrapper From 2aa75c752bdd9ce3ebc994353fa49146caad1940 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 13 Nov 2025 17:24:28 -0800 Subject: [PATCH 032/578] [ROCm] Bump up the version of amd-smi to 6.4.3 (#28680) Signed-off-by: Sage Moore --- requirements/rocm-build.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 51f58e57a785..b977e80be067 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -13,5 +13,5 @@ setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 wheel jinja2>=3.1.6 -amdsmi==6.2.4 +amdsmi==6.4.3 timm>=1.0.17 From 622e6106a9e3d64fb4927e3d9dc6e4f5289d174c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 13 Nov 2025 20:49:55 -0500 Subject: [PATCH 033/578] [CPU][Bugfix] Fix Apple Silicon M1 compilation failure (#28681) Signed-off-by: mgoin --- csrc/cpu/cpu_attn_impl.hpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index c317453530af..5de8a114b2b5 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -5,6 +5,10 @@ #include #include +#if defined(__APPLE__) + #include +#endif + #include "cpu_types.hpp" #include "scratchpad_manager.h" #include "cpu_attn_macros.h" @@ -741,9 +745,21 @@ class AttentionScheduler { static int64_t get_available_l2_size() { static int64_t size = []() { +#if defined(__APPLE__) + // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. + int64_t l2_cache_size = 0; + size_t len = sizeof(l2_cache_size); + if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && + l2_cache_size > 0) { + return l2_cache_size >> 1; // use 50% of L2 cache + } + // Fallback if sysctlbyname fails + return 128 * 1024 >> 1; // use 50% of 128KB +#else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); TORCH_CHECK_NE(l2_cache_size, -1); return l2_cache_size >> 1; // use 50% of L2 cache +#endif }(); return size; } @@ -816,10 +832,14 @@ struct VecTypeTrait { using vec_t = vec_op::FP32Vec16; }; +// ARM only supports BF16 with ARMv8.6-A extension +#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)) +#else template <> struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#endif #if !defined(__powerpc__) template <> @@ -1588,9 +1608,17 @@ class AttentionMainLoop { if (use_sink) { alignas(64) float s_aux_fp32[16]; +#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT) + // ARM without native BF16 support: manual conversion + for (int i = 0; i < 16; ++i) { + s_aux_fp32[i] = static_cast(curr_s_aux[i]); + } +#else + // All other platforms have BF16Vec16 available vec_op::BF16Vec16 vec_bf16(curr_s_aux); vec_op::FP32Vec16 vec_fp32(vec_bf16); vec_fp32.save(s_aux_fp32); +#endif float* __restrict__ curr_sum_buffer = sum_buffer; float* __restrict__ curr_max_buffer = max_buffer; From b39a5026ebac9242740e48debc79ce8db92c868b Mon Sep 17 00:00:00 2001 From: Bradley D Date: Thu, 13 Nov 2025 18:44:36 -0800 Subject: [PATCH 034/578] [ci][amd] fix basic models extra init test (#28676) Signed-off-by: Bradley Davis --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 0e5b21ddf25b..864eb470bb0a 100755 --- 
a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -59,7 +59,7 @@ while true; do fi done -echo "--- Pulling container" +echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" @@ -177,13 +177,13 @@ if [[ -z "$render_gid" ]]; then exit 1 fi -# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then - # assign job count as the number of shards used - commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + # assign job count as the number of shards used + commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g') for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do # assign shard-id for each shard - commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g') echo "Shard ${GPU} commands:$commands_gpu" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ From 01bea115c426a86c5e565a1fc0b9563f58e0bd1a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Nov 2025 11:10:10 +0800 Subject: [PATCH 035/578] [Misc] Remove `warn_for_unimplemented_methods` (#28613) Signed-off-by: DarkLight1337 --- vllm/utils/__init__.py | 45 ----------------------------------- vllm/v1/worker/worker_base.py | 2 -- 2 files changed, 47 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 9b0045279a67..040c0416c5ea 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import inspect import uuid import warnings -from functools import wraps from typing import Any, TypeVar import torch @@ -69,49 +67,6 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -def warn_for_unimplemented_methods(cls: type[T]) -> type[T]: - """ - A replacement for `abc.ABC`. - When we use `abc.ABC`, subclasses will fail to instantiate - if they do not implement all abstract methods. - Here, we only require `raise NotImplementedError` in the - base class, and log a warning if the method is not implemented - in the subclass. 
- """ - - original_init = cls.__init__ - - def find_unimplemented_methods(self: object): - unimplemented_methods = [] - for attr_name in dir(self): - # bypass inner method - if attr_name.startswith("_"): - continue - - try: - attr = getattr(self, attr_name) - # get the func of callable method - if callable(attr): - attr_func = attr.__func__ - except AttributeError: - continue - src = inspect.getsource(attr_func) - if "NotImplementedError" in src: - unimplemented_methods.append(attr_name) - if unimplemented_methods: - method_names = ",".join(unimplemented_methods) - msg = f"Methods {method_names} not implemented in {self}" - logger.debug(msg) - - @wraps(original_init) - def wrapped_init(self, *args, **kwargs) -> None: - original_init(self, *args, **kwargs) - find_unimplemented_methods(self) - - type.__setattr__(cls, "__init__", wrapped_init) - return cls - - def length_from_prompt_token_ids_or_embeds( prompt_token_ids: list[int] | None, prompt_embeds: torch.Tensor | None, diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 30ea0ab77bd9..3991c16eefba 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -13,7 +13,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import worker_receiver_cache_from_config -from vllm.utils import warn_for_unimplemented_methods from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.system_utils import update_environment_variables from vllm.v1.kv_cache_interface import KVCacheSpec @@ -33,7 +32,6 @@ _R = TypeVar("_R") -@warn_for_unimplemented_methods class WorkerBase: """Worker interface that allows vLLM to cleanly separate implementations for different hardware. Also abstracts control plane communication, e.g., to From da14ae0fad3165b88fcdc03a8f59f1813f8e832a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 14 Nov 2025 11:15:50 +0800 Subject: [PATCH 036/578] [XPU][CI]disable lm cache uts (#28696) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 27ed67c4517e..d49f3e2f47cf 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -46,6 +46,6 @@ docker run \ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/test_serial_utils.py ' From 0aecd9138f45f6f687858ac1e0c5206d30c8425e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 13 Nov 2025 21:52:53 -0800 Subject: [PATCH 037/578] [Misc] Update xformers to 0.33.0.post1 (#28678) Signed-off-by: Roger Wang --- requirements/cuda.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cuda.txt 
b/requirements/cuda.txt index 76874cbd2f48..d63fe9e1e77c 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -9,6 +9,6 @@ torch==2.9.0 torchaudio==2.9.0 # These must be updated alongside torch torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers==0.0.33; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9 +xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9 # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.2 From 0b25498990f01ea2553c02731d6e2ce2d550156a Mon Sep 17 00:00:00 2001 From: haoyangli-amd Date: Fri, 14 Nov 2025 13:56:35 +0800 Subject: [PATCH 038/578] [Misc] add ignore mapper for quark quantization (#28275) Signed-off-by: Haoyang Li --- .../layers/quantization/quark/quark.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 095a66ef10f9..1bb698faf46d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Any, Optional, cast +from typing import TYPE_CHECKING, Any, Optional, cast import torch @@ -34,6 +34,9 @@ ) from vllm.platforms import current_platform +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + __all__ = ["QuarkLinearMethod"] logger = init_logger(__name__) @@ -54,6 +57,7 @@ def __init__( self.kv_cache_group = kv_cache_group self.kv_cache_config = kv_cache_config self.pack_method = pack_method + self.ignore: list[str] = cast(list[str], self.quant_config.get("exclude", [])) def get_linear_method(self) -> "QuarkLinearMethod": return QuarkLinearMethod(self) @@ -74,9 +78,8 @@ def get_quant_method( from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. 
- exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer( - prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping + prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping ): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): @@ -90,6 +93,9 @@ def get_quant_method( return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) return None + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) + @classmethod def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": export_config = config.get("export") From 15ae8e0784d3889c6aa2c487ca00df4e3fde6f44 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 14 Nov 2025 00:34:01 -0600 Subject: [PATCH 039/578] [Bugfix][CI/Test][Spec Decode] Fix illegal memory access in offline_inference/spec_decode.py (Issue 27619) (#28432) Signed-off-by: Randall Smith Co-authored-by: Randall Smith Co-authored-by: TJian --- vllm/attention/ops/triton_reshape_and_cache_flash.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/attention/ops/triton_reshape_and_cache_flash.py index bbcd560ad56e..5d2ba154ae01 100644 --- a/vllm/attention/ops/triton_reshape_and_cache_flash.py +++ b/vllm/attention/ops/triton_reshape_and_cache_flash.py @@ -97,7 +97,6 @@ def triton_reshape_and_cache_flash( k_scale: torch.Tensor, # float32 v_scale: torch.Tensor, # float32 ): - num_tokens = key.shape[0] num_heads = key.shape[1] head_size = key.shape[2] block_size = key_cache.shape[1] @@ -155,7 +154,10 @@ def triton_reshape_and_cache_flash( # TODO(ngl): maybe replace with static launch grid to avoid overhead if # using cudagraphs - grid = lambda meta: (int(num_tokens), triton.cdiv(n, meta["TILE_SIZE"])) + grid = lambda meta: ( + slot_mapping.shape[0], + triton.cdiv(n, meta["TILE_SIZE"]), + ) reshape_and_cache_kernel_flash[grid]( key_ptr=key, From 93103575ce0480f36fc1a3603eb51d9a89f38a00 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 14 Nov 2025 00:41:29 -0600 Subject: [PATCH 040/578] [BugFix][CI/Build][ROCM] Fix import error and apply assert in appropriate case in test_struct_output_generate (#28311) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../v1/entrypoints/llm/test_struct_output_generate.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 4cd26e7b41d3..a7d769c8542a 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -677,9 +677,14 @@ def test_structured_output_with_reasoning_matrices( reasoning, content = run_reasoning_extraction(reasoner, [generated_text]) print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}") - assert content is not None and reasoning is not None - output_json = json.loads(content) - jsonschema.validate(instance=output_json, schema=reasoning_schema) + if "Qwen3" in model_name: + assert content is not None + + assert reasoning is not None + + if content is not None: + output_json = json.loads(content) + jsonschema.validate(instance=output_json, schema=reasoning_schema) @pytest.mark.skip_global_cleanup From 529cea343da8662f135a69d9c3157f388f5eb64a Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Fri, 14 Nov 2025 16:55:29 +0800 Subject: [PATCH 041/578] use default 
CCL_ZE_IPC_EXCHANGE (#28700) Signed-off-by: Yan Ma --- vllm/v1/worker/xpu_worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 31fa3f3bd6ac..26c6f8d06bdc 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -159,12 +159,10 @@ def init_device(self): else: raise RuntimeError(f"Not support device type: {self.device_config.device}") - ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "pidfd") ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ENV_LOCAL_WORLD_SIZE = os.getenv( "LOCAL_WORLD_SIZE", str(self.parallel_config.world_size) ) - os.environ["CCL_ZE_IPC_EXCHANGE"] = ENV_CCL_ZE_IPC_EXCHANGE os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE os.environ["LOCAL_RANK"] = str(self.local_rank) From c36bcfe6b37967ab52763f2ddb9400ff4fe3885b Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Fri, 14 Nov 2025 17:01:26 +0800 Subject: [PATCH 042/578] [Bugfix] fix dots.ocr pp support (#28705) Signed-off-by: zjy0516 --- vllm/model_executor/models/dots_ocr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 25e5588961a6..405af8f8be42 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -780,6 +780,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): architectures=["Qwen2ForCausalLM"], ) + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + def _parse_and_validate_image_input( self, **kwargs: object ) -> DotsOCRImageInputs | None: From bc3e43069aadb1fa301a9f60a22872b6ec4453b9 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 14 Nov 2025 01:11:13 -0800 Subject: [PATCH 043/578] [BugFix] Fix multi-modal async scheduling race condition (#28706) Signed-off-by: Nick Hill --- .../shm_object_storage.py | 6 +-- vllm/v1/serial_utils.py | 26 ++++++++---- vllm/v1/worker/gpu_model_runner.py | 42 +++++++++---------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py index 2ec33afb8783..4af2caa16b0d 100644 --- a/vllm/distributed/device_communicators/shm_object_storage.py +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -342,8 +342,8 @@ def __init__(self): from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder self.encoder = MsgpackEncoder() - self.tensor_decoder = MsgpackDecoder(torch.Tensor) - self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem) + self.tensor_decoder = MsgpackDecoder(torch.Tensor, share_mem=False) + self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem, share_mem=False) self._mm_kwargs_item_cls = MultiModalKwargsItem def serialize(self, value: Any) -> tuple[bytes | list[bytes], int, bytes, int]: @@ -368,7 +368,7 @@ def deserialize(self, data_view: memoryview) -> Any: # pickle.loads do not read past the end of a pickled object # within a large buffer, so we can skip storing the metadata size type_name, nbytes, len_arr = pickle.loads(data_view) - serialized_data = bytearray(data_view[-nbytes:]) + serialized_data = data_view[-nbytes:] if type_name == torch.Tensor.__name__: obj = [] diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 102357ca7c64..cf0b1a41b50f 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -31,6 +31,7 @@ 
MultiModalSharedField, NestedTensors, ) +from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.engine import UtilityResult from vllm.v1.utils import tensor_data @@ -282,7 +283,9 @@ class MsgpackDecoder: not thread-safe when encoding tensors / numpy arrays. """ - def __init__(self, t: Any | None = None): + def __init__(self, t: Any | None = None, share_mem: bool = True): + self.share_mem = share_mem + self.pin_tensors = is_pin_memory_available() args = () if t is None else (t,) self.decoder = msgpack.Decoder( *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook @@ -347,21 +350,30 @@ def _decode_ndarray(self, arr: Any) -> np.ndarray: # zero-copy decode. We assume the ndarray will not be kept around, # as it now locks the whole received message buffer in memory. buffer = self.aux_buffers[data] if isinstance(data, int) else data - return np.frombuffer(buffer, dtype=dtype).reshape(shape) + arr = np.frombuffer(buffer, dtype=dtype) + if not self.share_mem: + arr = arr.copy() + return arr.reshape(shape) def _decode_tensor(self, arr: Any) -> torch.Tensor: dtype, shape, data = arr - # Copy from inline representation, to decouple the memory storage - # of the message from the original buffer. And also make Torch - # not complain about a readonly memoryview. - buffer = self.aux_buffers[data] if isinstance(data, int) else bytearray(data) + is_aux = isinstance(data, int) + buffer = self.aux_buffers[data] if is_aux else data + buffer = buffer if isinstance(buffer, memoryview) else memoryview(buffer) torch_dtype = getattr(torch, dtype) assert isinstance(torch_dtype, torch.dtype) - if not buffer: # torch.frombuffer doesn't like empty buffers + if not buffer.nbytes: # torch.frombuffer doesn't like empty buffers assert 0 in shape return torch.empty(shape, dtype=torch_dtype) # Create uint8 array arr = torch.frombuffer(buffer, dtype=torch.uint8) + # Clone ensures tensor is backed by pytorch-owned memory for safe + # future async CPU->GPU transfer. + # Pin larger tensors for more efficient CPU->GPU transfer. 
+ if not is_aux: + arr = arr.clone() + elif not self.share_mem: + arr = arr.pin_memory() if self.pin_tensors else arr.clone() # Convert back to proper shape & type return arr.view(torch_dtype).view(shape) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c9c64137ca04..d0f7f3a501f5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2590,28 +2590,28 @@ def execute_model( ) ) - dp_rank = self.parallel_config.data_parallel_rank - if ubatch_slices: - assert num_tokens_across_dp is not None - num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) - self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens) - elif num_tokens_across_dp is not None: - num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) - else: - num_input_tokens = self._get_num_input_tokens( - scheduler_output.total_num_scheduled_tokens - ) + dp_rank = self.parallel_config.data_parallel_rank + if ubatch_slices: + assert num_tokens_across_dp is not None + num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) + self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens) + elif num_tokens_across_dp is not None: + num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) + else: + num_input_tokens = self._get_num_input_tokens( + scheduler_output.total_num_scheduled_tokens + ) - ( - input_ids, - inputs_embeds, - positions, - intermediate_tensors, - model_kwargs, - ec_connector_output, - ) = self._preprocess( - scheduler_output, num_input_tokens, intermediate_tensors - ) + ( + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ec_connector_output, + ) = self._preprocess( + scheduler_output, num_input_tokens, intermediate_tensors + ) uniform_decode = ( max_num_scheduled_tokens == self.uniform_decode_query_len From c9a3a02149d83cc2840769228c4e591d39351bb6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 04:32:03 -0500 Subject: [PATCH 044/578] Add output token counting to gsm8k eval (#28594) Signed-off-by: mgoin --- tests/evals/gsm8k/gsm8k_eval.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py index c7799607912b..0421f8bb1859 100644 --- a/tests/evals/gsm8k/gsm8k_eval.py +++ b/tests/evals/gsm8k/gsm8k_eval.py @@ -83,8 +83,12 @@ async def call_vllm_api( stop: list[str] | None = None, url: str | None = None, seed: int | None = None, -) -> str: - """Call vLLM's OpenAI-compatible completions endpoint.""" +) -> tuple[str, int]: + """Call vLLM's OpenAI-compatible completions endpoint. 
+ + Returns: + Tuple of (response_text, completion_tokens) + """ data = { "prompt": prompt, "temperature": temperature, @@ -98,10 +102,12 @@ async def call_vllm_api( async with session.post(f"{url}/v1/completions", json=data) as response: response.raise_for_status() result = await response.json() - return result["choices"][0]["text"] + text = result["choices"][0]["text"] + completion_tokens = result.get("usage", {}).get("completion_tokens", 0) + return text, completion_tokens except Exception as e: print(f"Error calling vLLM API: {e}") - return "" + return "", 0 def evaluate_gsm8k( @@ -146,10 +152,11 @@ def evaluate_gsm8k( # Run evaluation async def run_async_evaluation(): states: list[str] = [""] * num_questions + output_tokens: list[int] = [0] * num_questions - async def get_answer(session: aiohttp.ClientSession, i: int) -> str: + async def get_answer(session: aiohttp.ClientSession, i: int) -> tuple[str, int]: prompt = few_shot_examples + questions[i] - answer = await call_vllm_api( + answer, tokens = await call_vllm_api( session=session, prompt=prompt, temperature=temperature, @@ -159,7 +166,8 @@ async def get_answer(session: aiohttp.ClientSession, i: int) -> str: seed=seed, ) states[i] = answer - return answer + output_tokens[i] = tokens + return answer, tokens async with aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=600) @@ -167,24 +175,28 @@ async def get_answer(session: aiohttp.ClientSession, i: int) -> str: tasks = [get_answer(session, i) for i in range(num_questions)] await tqdm.gather(*tasks, desc="Evaluating") - return states + return states, output_tokens print(f"Running GSM8K evaluation: {num_questions} questions, {num_shots}-shot") tic = time.perf_counter() - states = asyncio.run(run_async_evaluation()) + states, output_tokens = asyncio.run(run_async_evaluation()) latency = time.perf_counter() - tic # Compute metrics preds = [get_answer_value(state) for state in states] accuracy = np.mean(np.array(preds) == np.array(labels)) invalid_rate = np.mean(np.array(preds) == INVALID) + total_output_tokens = sum(output_tokens) + tokens_per_second = total_output_tokens / latency if latency > 0 else 0.0 result = { "accuracy": accuracy, "invalid_rate": invalid_rate, "latency": latency, "questions_per_second": num_questions / latency, + "total_output_tokens": total_output_tokens, + "tokens_per_second": tokens_per_second, "num_questions": num_questions, "num_shots": num_shots, "max_tokens": max_tokens, @@ -236,6 +248,8 @@ def main() -> None: print(f"Invalid responses: {result['invalid_rate']:.3f}") print(f"Total latency: {result['latency']:.3f} s") print(f"Questions per second: {result['questions_per_second']:.3f}") + print(f"Total output tokens: {result['total_output_tokens']}") + print(f"Output tokens per second: {result['tokens_per_second']:.3f}") # Optional file saving if args.save_results: From fd75d3e8c0f522178e39845276fd57908760b4d0 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 14 Nov 2025 01:32:31 -0800 Subject: [PATCH 045/578] [Minor] avoid register new custom and just import silly_attn (#28578) Signed-off-by: Boyuan Feng --- tests/compile/test_config.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index bb66ef5529b1..1e8a882a7f3e 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -15,6 +15,9 @@ from vllm.platforms import current_platform from vllm.utils.torch_utils import _is_torch_equal_or_newer +# This import automatically registers 
`torch.ops.silly.attention` +from . import silly_attention # noqa: F401 + def test_version(): # Test the version comparison logic using the private function @@ -257,15 +260,6 @@ def test_should_split(): splitting_ops = ["aten::add.Tensor"] assert not should_split(node, splitting_ops) - @torch.library.custom_op( - "silly::attention", - mutates_args=["out"], - ) - def attention( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor - ) -> None: - out.copy_(q + k + v) - q, k, v, out = [torch.randn(1)] * 4 # supports custom ops as OpOverloadPacket From 8cfbe89b9389e5a10ee08059e6b2855e6c979e4e Mon Sep 17 00:00:00 2001 From: Xing Liu <93360308+xingliu14@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:32:46 -0800 Subject: [PATCH 046/578] [Misc] fix comment in test_envs (#28529) Signed-off-by: Xing Liu --- tests/test_envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 841d7945f912..6a9835a68e7e 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -36,7 +36,7 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): # Enable envs cache and ignore ongoing environment changes enable_envs_cache() - # __getattr__ is not decorated with functools.cache + # __getattr__ is decorated with functools.cache assert hasattr(envs.__getattr__, "cache_info") start_hits = envs.__getattr__.cache_info().hits From ecf8230d4d196566a76c907949d6569b1ff176ad Mon Sep 17 00:00:00 2001 From: lyn610 <610lyn@gmail.com> Date: Fri, 14 Nov 2025 17:47:45 +0800 Subject: [PATCH 047/578] [Metrics] Log number of preempted requests (#28522) Add tracking and periodic logging for the number of preempted requests in the metrics logger. This helps monitor system behavior under load. Signed-off-by: Yining Liu <610lyn@gmail.com> --- vllm/v1/metrics/loggers.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 1a175e9e110b..21280b9c84cf 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -118,12 +118,14 @@ def _reset(self, now): self.num_prompt_tokens: int = 0 self.num_generation_tokens: int = 0 self.num_corrupted_reqs: int = 0 + self.num_preemptions: int = 0 def _track_iteration_stats(self, iteration_stats: IterationStats): # Save tracked stats for token counters. 
self.num_prompt_tokens += iteration_stats.num_prompt_tokens self.num_generation_tokens += iteration_stats.num_generation_tokens self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs + self.num_preemptions += iteration_stats.num_preempted_reqs def _get_throughput(self, tracked_stats: int, now: float) -> float: # Compute summary metrics for tracked stats @@ -196,18 +198,31 @@ def log(self): "Avg generation throughput: %.1f tokens/s", "Running: %d reqs", "Waiting: %d reqs", - "GPU KV cache usage: %.1f%%", - "Prefix cache hit rate: %.1f%%", ] log_args = [ self.last_prompt_throughput, self.last_generation_throughput, self.last_scheduler_stats.num_running_reqs, self.last_scheduler_stats.num_waiting_reqs, - self.last_scheduler_stats.kv_cache_usage * 100, - self.prefix_caching_metrics.hit_rate * 100, ] + if self.num_preemptions > 0: + log_parts.append("Preemptions: %d") + log_args.append(self.num_preemptions) + + log_parts.extend( + [ + "GPU KV cache usage: %.1f%%", + "Prefix cache hit rate: %.1f%%", + ] + ) + log_args.extend( + [ + self.last_scheduler_stats.kv_cache_usage * 100, + self.prefix_caching_metrics.hit_rate * 100, + ] + ) + if envs.VLLM_COMPUTE_NANS_IN_LOGITS: log_parts.append("Corrupted: %d reqs") log_args.append(self.num_corrupted_reqs) From 360bd8762f053c59ee19e2fd72cb1e5a28423958 Mon Sep 17 00:00:00 2001 From: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Date: Fri, 14 Nov 2025 03:03:55 -0800 Subject: [PATCH 048/578] [Frontend] Added chat-style multimodal support to /classify. (#27516) Signed-off-by: WorldExplored Signed-off-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Signed-off-by: vnadathur Signed-off-by: wang.yuqi Co-authored-by: vnadathur <236933696+vnadathur@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: vnadathur Co-authored-by: wang.yuqi Co-authored-by: wang.yuqi --- .../pooling/openai/test_classification.py | 10 ++ .../openai/test_vision_classification.py | 95 ++++++++++++++ vllm/entrypoints/openai/api_server.py | 3 + vllm/entrypoints/openai/protocol.py | 116 +++++++++++++++++- .../openai/serving_classification.py | 100 ++++++++++++--- vllm/entrypoints/openai/serving_engine.py | 21 +++- 6 files changed, 318 insertions(+), 27 deletions(-) create mode 100644 tests/entrypoints/pooling/openai/test_vision_classification.py diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 671bb948780a..25080d4189c2 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -46,6 +46,16 @@ def test_single_input_classification(server: RemoteOpenAIServer, model_name: str assert hasattr(output.data[0], "probs") +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str): + response = requests.post( + server.url_for("classify"), + json={"model": model_name, "input": "hello", "add_special_tokens": False}, + ) + response.raise_for_status() + ClassificationResponse.model_validate(response.json()) + + @pytest.mark.parametrize("model_name", [MODEL_NAME]) def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str): input_texts = [ diff --git a/tests/entrypoints/pooling/openai/test_vision_classification.py b/tests/entrypoints/pooling/openai/test_vision_classification.py new file mode 100644 index 
000000000000..f2616e057b17 --- /dev/null +++ b/tests/entrypoints/pooling/openai/test_vision_classification.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest +import requests + +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.openai.protocol import ClassificationResponse + +VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls" +MAXIMUM_VIDEOS = 1 +TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4" + +HF_OVERRIDES = { + "text_config": { + "architectures": ["Qwen2_5_VLForSequenceClassification"], + }, +} + + +@pytest.fixture(scope="module") +def server_vlm_classify(): + args = [ + "--runner", + "pooling", + "--max-model-len", + "5000", + "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"video": MAXIMUM_VIDEOS}), + ] + + with RemoteOpenAIServer( + VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES + ) as remote_server: + yield remote_server + + +@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME]) +def test_classify_accepts_chat_text_only( + server_vlm_classify: RemoteOpenAIServer, model_name: str +) -> None: + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this text request."}, + ], + } + ] + + response = requests.post( + server_vlm_classify.url_for("classify"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = ClassificationResponse.model_validate(response.json()) + + assert output.object == "list" + assert output.model == model_name + assert len(output.data) == 1 + assert len(output.data[0].probs) == 2 + assert output.usage.prompt_tokens == 22 + + +@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME]) +def test_classify_accepts_chat_video_url( + server_vlm_classify: RemoteOpenAIServer, model_name: str +) -> None: + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this video."}, + {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}}, + ], + } + ] + + response = requests.post( + server_vlm_classify.url_for("classify"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = ClassificationResponse.model_validate(response.json()) + + assert output.object == "list" + assert output.model == model_name + assert len(output.data) == 1 + assert len(output.data[0].probs) == 2 + assert output.usage.prompt_tokens == 4807 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fbb2d32a229d..f30c6ef2cd0a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1784,6 +1784,9 @@ async def init_app_state( engine_client, state.openai_serving_models, request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, log_error_stack=args.log_error_stack, ) if "classify" in supported_tasks diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 69e757d4764d..45584df8b9e2 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2000,10 +2000,10 @@ class ScoreResponse(OpenAIBaseModel): usage: UsageInfo -class ClassificationRequest(OpenAIBaseModel): +class ClassificationCompletionRequest(OpenAIBaseModel): model: str | 
None = None input: list[str] | str - truncate_prompt_tokens: int | None = None + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None user: str | None = None # --8<-- [start:classification-extra-params] @@ -2015,7 +2015,21 @@ class ClassificationRequest(OpenAIBaseModel): "if the served model does not use priority scheduling." ), ) - + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) softmax: bool | None = Field( default=None, description="softmax will be deprecated, please use use_activation instead.", @@ -2040,6 +2054,102 @@ def to_pooling_params(self): ) +class ClassificationChatRequest(OpenAIBaseModel): + model: str | None = None + messages: list[ChatCompletionMessageParam] + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + user: str | None = None + + # --8<-- [start:chat-classification-extra-params] + add_generation_prompt: bool = Field( + default=False, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) + # --8<-- [end:chat-classification-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + use_activation=get_use_activation(self), + ) + + +ClassificationRequest: TypeAlias = ( + ClassificationCompletionRequest | ClassificationChatRequest +) + + class ClassificationData(OpenAIBaseModel): index: int label: str | None diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 45bbe732a680..167ee152fece 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -4,13 +4,17 @@ from http import HTTPStatus from typing import cast +import jinja2 import numpy as np from fastapi import Request -from typing_extensions import override from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ClassificationChatRequest, + ClassificationCompletionRequest, ClassificationData, ClassificationRequest, ClassificationResponse, @@ -32,7 +36,10 @@ class ClassificationMixin(OpenAIServing): - @override + chat_template: str | None + chat_template_content_format: ChatTemplateContentFormatOption + trust_request_chat_template: bool + async def _preprocess( self, ctx: ServeContext, @@ -42,31 +49,79 @@ async def _preprocess( and prepare model-specific inputs. """ ctx = cast(ClassificationServeContext, ctx) - if isinstance(ctx.request.input, str) and not ctx.request.input: - return self.create_error_response( - "Input cannot be empty for classification", - status_code=HTTPStatus.BAD_REQUEST, - ) - - if isinstance(ctx.request.input, list) and len(ctx.request.input) == 0: - return None - try: ctx.tokenizer = await self.engine_client.get_tokenizer() - renderer = self._get_renderer(ctx.tokenizer) - ctx.engine_prompts = await renderer.render_prompt( - prompt_or_prompts=ctx.request.input, - config=self._build_render_config(ctx.request), - ) + request_obj = ctx.request + + if isinstance(request_obj, ClassificationChatRequest): + chat_request = request_obj + messages = chat_request.messages + trust_request_chat_template = getattr( + self, + "trust_request_chat_template", + False, + ) + ret = self._validate_chat_template( + request_chat_template=chat_request.chat_template, + chat_template_kwargs=chat_request.chat_template_kwargs, + trust_request_chat_template=trust_request_chat_template, + ) + if ret: + return ret + + ( + _, + _, + engine_prompts, + ) = await self._preprocess_chat( + cast(ChatCompletionRequest, chat_request), + ctx.tokenizer, + messages, + chat_template=( + chat_request.chat_template + or getattr(self, "chat_template", None) + ), + chat_template_content_format=cast( + ChatTemplateContentFormatOption, + getattr(self, "chat_template_content_format", "auto"), + ), + add_generation_prompt=False, + continue_final_message=False, + add_special_tokens=chat_request.add_special_tokens, + ) + ctx.engine_prompts = engine_prompts + + elif isinstance(request_obj, ClassificationCompletionRequest): + completion_request = request_obj + input_data = completion_request.input + if input_data in (None, ""): + return self.create_error_response( + "Input or messages must be provided", + status_code=HTTPStatus.BAD_REQUEST, + ) + if isinstance(input_data, list) and not input_data: + ctx.engine_prompts = [] + return None + + renderer 
= self._get_renderer(ctx.tokenizer) + prompt_input = cast(str | list[str], input_data) + ctx.engine_prompts = await renderer.render_prompt( + prompt_or_prompts=prompt_input, + config=self._build_render_config(completion_request), + ) + else: + return self.create_error_response( + "Invalid classification request type", + status_code=HTTPStatus.BAD_REQUEST, + ) return None - except (ValueError, TypeError) as e: + except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - @override def _build_response( self, ctx: ServeContext, @@ -118,6 +173,7 @@ def _build_render_config(self, request: ClassificationRequest) -> RenderConfig: return RenderConfig( max_length=self.max_model_len, truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, ) @@ -130,6 +186,9 @@ def __init__( models: OpenAIServingModels, *, request_logger: RequestLogger | None, + chat_template: str | None = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", + trust_request_chat_template: bool = False, log_error_stack: bool = False, ) -> None: super().__init__( @@ -139,6 +198,10 @@ def __init__( log_error_stack=log_error_stack, ) + self.chat_template = chat_template + self.chat_template_content_format = chat_template_content_format + self.trust_request_chat_template = trust_request_chat_template + async def create_classify( self, request: ClassificationRequest, @@ -156,7 +219,6 @@ async def create_classify( return await super().handle(ctx) # type: ignore - @override def _create_pooling_params( self, ctx: ClassificationServeContext, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1456727a3cdd..03f10e5a91e6 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -43,6 +43,8 @@ ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, + ClassificationChatRequest, + ClassificationCompletionRequest, ClassificationRequest, ClassificationResponse, CompletionRequest, @@ -114,13 +116,16 @@ | DetokenizeRequest | EmbeddingCompletionRequest | RerankRequest - | ClassificationRequest + | ClassificationCompletionRequest | ScoreRequest | TokenizeCompletionRequest ) ChatLikeRequest: TypeAlias = ( - ChatCompletionRequest | EmbeddingChatRequest | TokenizeChatRequest + ChatCompletionRequest + | EmbeddingChatRequest + | TokenizeChatRequest + | ClassificationChatRequest ) SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest AnyRequest: TypeAlias = ( @@ -814,7 +819,11 @@ def _get_message_types(self, request: AnyRequest) -> set[str]: if not hasattr(request, "messages"): return message_types - for message in request.messages: + messages = request.messages + if messages is None or isinstance(messages, (str, bytes)): + return message_types + + for message in messages: if ( isinstance(message, dict) and "content" in message @@ -907,7 +916,8 @@ def _validate_input( EmbeddingCompletionRequest, ScoreRequest, RerankRequest, - ClassificationRequest, + ClassificationCompletionRequest, + ClassificationChatRequest, ), ): # Note: input length can be up to the entire model context length @@ -915,7 +925,8 @@ def _validate_input( if token_num > self.max_model_len: operations: dict[type[AnyRequest], str] = { ScoreRequest: "score", - ClassificationRequest: "classification", + ClassificationCompletionRequest: "classification", + 
ClassificationChatRequest: "classification", } operation = operations.get(type(request), "embedding generation") raise ValueError( From 41b92f7d38d3f056004991c026f6a24846755ef4 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Fri, 14 Nov 2025 19:16:13 +0800 Subject: [PATCH 049/578] [Model][MM] Extract conv layer as CustomOp (#28455) Signed-off-by: shen-shanshan <467638484@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/layers/conv.py | 236 ++++++++++++++++++ vllm/model_executor/models/clip.py | 3 +- vllm/model_executor/models/glm4_1v.py | 17 +- vllm/model_executor/models/qwen2_5_vl.py | 18 +- vllm/model_executor/models/qwen2_vl.py | 18 +- .../models/qwen3_omni_moe_thinker.py | 17 +- vllm/model_executor/models/qwen3_vl.py | 18 +- vllm/model_executor/models/vision.py | 16 -- 8 files changed, 277 insertions(+), 66 deletions(-) create mode 100644 vllm/model_executor/layers/conv.py diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py new file mode 100644 index 000000000000..e6f2d2990c24 --- /dev/null +++ b/vllm/model_executor/layers/conv.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Conv Layer Class.""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from vllm.model_executor.custom_op import CustomOp +from vllm.utils.torch_utils import is_torch_equal + + +class ConvLayerBase(CustomOp): + """Conv layer base class.""" + + num_dim: int + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int | tuple[int, ...], + stride: int | tuple[int, ...] = 1, + padding: int | tuple[int, ...] = 0, + dilation: int | tuple[int, ...] 
= 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + *, + params_dtype: torch.dtype | None = None, + ) -> None: + super().__init__() + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + kernel_size = ( + (kernel_size,) * self.num_dim + if isinstance(kernel_size, int) + else kernel_size + ) + stride = (stride,) * self.num_dim if isinstance(stride, int) else stride + padding = (padding,) * self.num_dim if isinstance(padding, int) else padding + dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.padding_mode = padding_mode + + self.enable_linear = ( + (self.kernel_size == self.stride) + and not any(self.padding) + and self.groups == 1 + ) + self.input_size = in_channels * math.prod(self.kernel_size) + + self.weight = nn.Parameter( + torch.empty( + out_channels, + in_channels // groups, + *kernel_size, + dtype=params_dtype, + ), + ) + + if bias: + self.bias = nn.Parameter(torch.empty(self.out_channels, dtype=params_dtype)) + else: + self.register_parameter("bias", None) + + def extra_repr(self) -> str: + s = f"in_channels={self.in_channels}, " + s += f"out_channels={self.out_channels}, " + s += f"kernel_size={self.kernel_size}, " + s += f"stride={self.stride}, " + s += f"padding={self.padding}, " + s += f"bias={self.bias is not None}" + return s + + +@CustomOp.register("conv2d") +class Conv2dLayer(ConvLayerBase): + """Conv layer with Conv2d.""" + + num_dim = 2 + + def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 4 + B, C, H, W = x.shape + K1, K2 = self.kernel_size + H, W = H // K1, W // K2 + x = x.unfold(2, K1, K1).unfold(3, K2, K2) + x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size) + x = F.linear( + x, + self.weight.view(self.out_channels, self.input_size), + self.bias, + ) + x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2) + return x + + def _forward_conv(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 4 + x = F.conv2d( + x, + self.weight, + self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + return x + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """Expected input shape: (batch_size, in_channels, height, width)""" + assert x.dim() == 4 + if self.enable_linear: + return self._forward_mulmat(x) + else: + return self._forward_conv(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # By default, we use CUDNN's convolution ops with optimization. + return self._forward_conv(x) + + +class CausalConv2dLayer(Conv2dLayer): + """ + A causal version of nn.Conv2d where each location in the 2D matrix would + have no access to locations on its right or down + All arguments are the same as nn.Conv2d except padding which should be + set as None + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int = 0, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + *, + params_dtype: torch.dtype | None = None, + ) -> None: + if padding is not None: + raise ValueError( + "Argument padding should be set to None for CausalConv2dLayer." 
+ ) + self._left_padding: int = kernel_size - 1 + self._right_padding: int = stride - 1 + padding = 0 + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + params_dtype=params_dtype, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + x = F.pad(x, pad=(self._left_padding, self._right_padding, 0, 0)) + x = super().forward(x) + return x + + +@CustomOp.register("conv3d") +class Conv3dLayer(ConvLayerBase): + """Conv layer with Conv3d.""" + + num_dim = 3 + + def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 5 + B, C, T, H, W = x.shape + K1, K2, K3 = self.kernel_size + T, H, W = T // K1, H // K2, W // K3 + x = x.unfold(2, K1, K1).unfold(3, K2, K2).unfold(4, K3, K3) + x = x.permute(0, 2, 3, 4, 1, 5, 6, 7).reshape(-1, self.input_size) + x = F.linear( + x, + self.weight.view(self.out_channels, self.input_size), + self.bias, + ) + x = x.view(B, T, H, W, self.out_channels).permute(0, 4, 1, 2, 3) + return x + + def _forward_conv(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 5 + x = F.conv3d( + x, + self.weight, + self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + return x + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """Expected input shape: (batch_size, in_channels, time, height, width)""" + if self.enable_linear: + return self._forward_mulmat(x) + else: + return self._forward_conv(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch2.9.0 disabled CUDNN's Conv3D, which caused a + # significant performance regression. + # See: https://github.com/vllm-project/vllm/issues/27406 + # and https://github.com/pytorch/pytorch/issues/166122 + # By default, we use CUDNN's convolution ops with optimization. 
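+        # Note: the fallback below is gated on an *exact* 2.9.0 match, so other
+        # torch releases keep the (normally faster) cuDNN convolution path.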
+ if self.enable_linear and is_torch_equal("2.9.0"): + return self._forward_mulmat(x) + return self._forward_conv(x) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 50f476dfd185..5d611deb942d 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -20,6 +20,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -315,7 +316,7 @@ def __init__(self, config: CLIPVisionConfig): self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index b2d4fe0c0139..6953b805653b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -56,12 +56,12 @@ from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -103,7 +103,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -486,15 +485,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=True, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -893,9 +895,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 23591480b160..7617929e93ac 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -26,7 +26,6 @@ # limitations under the License. 
"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import lru_cache, partial from typing import Annotated, Any, Literal, TypeAlias @@ -56,12 +55,12 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -110,7 +109,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -525,15 +523,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=False, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -957,9 +958,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 13b54bbe1748..5d21e249fc4c 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,7 +25,6 @@ # limitations under the License. 
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from typing import Annotated, Any, Literal, TypeAlias @@ -54,9 +53,9 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -107,7 +106,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -566,15 +564,18 @@ def __init__( self.embed_dim = embed_dim kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, embed_dim, + kernel_size=kernel_size, + stride=kernel_size, bias=False, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.embed_dim) return x @@ -844,9 +845,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 5df2372a842c..40b80ce2387c 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -22,7 +22,6 @@ # limitations under the License. 
"""Inference-only Qwen3-Omni-Moe model (thinker part).""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from typing import Any @@ -54,9 +53,9 @@ from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -102,7 +101,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_llm_pos_ids_for_vision, get_vit_attn_backend, ) @@ -138,16 +136,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=True, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: L, C = x.shape - x = self.proj(x) + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -566,9 +566,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 5f5bde1dd72d..faeb9f81d961 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -24,7 +24,6 @@ # limitations under the License. 
"""Inference-only Qwen3VL model compatible with HuggingFace weights.""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from itertools import islice @@ -57,9 +56,9 @@ from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -114,7 +113,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -139,15 +137,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=True, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -579,9 +580,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 0e814e5c86ad..e5d70eb7bc2f 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -550,19 +550,3 @@ def get_llm_pos_ids_for_vision( llm_pos_ids_list.append(_llm_pos_ids + start_idx) llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1) return llm_pos_ids - - -# Due to a performance regression with Conv3D in PyTorch2.9, we reshape -# Conv3D weights to Linear weights for better performance. -# See: https://github.com/vllm-project/vllm/issues/27406 -# and https://github.com/pytorch/pytorch/issues/166122 -# FIXME(Isotr0py): Revert the PR introduces this workaround -# (https://github.com/vllm-project/vllm/pull/27418), -# once the performance issue is resolved in PyTorch. -def conv3d_to_linear_weight(conv3d_weight: torch.Tensor) -> torch.Tensor: - """ - Reshape Conv3D weight to Linear weight. Only work when kernel_size==stride. 
- """ - out_channels, in_channels, kt, kh, kw = conv3d_weight.shape - linear_weight = conv3d_weight.reshape(out_channels, in_channels * kt * kh * kw) - return linear_weight From 4516d44b7f990b8f92450e73720b89cc8ac155ca Mon Sep 17 00:00:00 2001 From: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:24:10 +0800 Subject: [PATCH 050/578] [DCP] Support Decode Context Parallel (DCP) for GQA with Flashinfer (#25438) Signed-off-by: gaojc <1055866782@qq.com> Signed-off-by: Jingchun Gao Signed-off-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> Signed-off-by: QiuChunshuo Co-authored-by: gaojingchun (A) Co-authored-by: Jingchun Gao Co-authored-by: QiuChunshuo --- tests/distributed/test_context_parallel.py | 17 +- vllm/config/model.py | 8 + vllm/utils/flashinfer.py | 9 + vllm/v1/attention/backends/flashinfer.py | 343 ++++++++++++++++++--- vllm/v1/executor/multiproc_executor.py | 5 + 5 files changed, 331 insertions(+), 51 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 3576efca591c..b16fd0d06b14 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -39,6 +39,7 @@ class ParallelSetup(NamedTuple): class CPTestOptions(NamedTuple): multi_node_only: bool load_format: str | None = None + attn_backend: str | None = None @dataclass @@ -58,6 +59,7 @@ def detailed( multi_node_only: bool = False, runner: RunnerOption = "auto", load_format: str | None = None, + attn_backend: str | None = None, ): parallel_setups = [] for eager_mode_val in [False]: @@ -79,7 +81,9 @@ def detailed( distributed_backends=["mp"], runner=runner, test_options=CPTestOptions( - multi_node_only=multi_node_only, load_format=load_format + multi_node_only=multi_node_only, + load_format=load_format, + attn_backend=attn_backend, ), ) @@ -117,7 +121,7 @@ def _compare_cp_with_tp( chunked_prefill, ) = parallel_setup - multi_node_only, load_format = test_options + multi_node_only, load_format, attn_backend = test_options model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_transformers_version(on_fail="skip") @@ -177,6 +181,13 @@ def _compare_cp_with_tp( if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) + if not attn_backend: + cp_env = tp_env = {} + else: + cp_env = tp_env = { + "VLLM_ATTENTION_BACKEND": attn_backend, + } + cp_args = [ *common_args, "--tensor-parallel-size", @@ -205,6 +216,8 @@ def _compare_cp_with_tp( model_id, cp_args, tp_args, + cp_env, + tp_env, method=method, max_wait_seconds=720, ) diff --git a/vllm/config/model.py b/vllm/config/model.py index f4ed99689e5b..8ec66b6b3160 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1183,6 +1183,14 @@ def verify_with_parallel_config( f"but got {decode_context_parallel_size}" ) + num_q_per_kv = total_num_attention_heads // total_num_kv_heads + assert num_q_per_kv % decode_context_parallel_size == 0, ( + f"Total number of q per kv attn heads ({num_q_per_kv})" + " must be divisible by dcp world size when enable " + "decode context parallel for GQA " + f"({parallel_config.decode_context_parallel_size})." 
+ ) + def get_sliding_window(self) -> int | None: """Get the sliding window size from the HF text config if present.""" return getattr(self.hf_text_config, "sliding_window", None) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 62af39513d65..79e5a4c30259 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -259,6 +259,7 @@ def use_trtllm_attention( num_kv_heads: int, num_tokens: int, max_seq_len: int, + dcp_world_size: int, kv_cache_dtype: str, q_dtype: torch.dtype, is_prefill: bool, @@ -272,6 +273,14 @@ def use_trtllm_attention( if force_use_trtllm is not None and not force_use_trtllm: return False + # Decode context parallel is not supported + if dcp_world_size > 1: + logger.warning_once( + "Trtllm does not support returning LSE and as a result " + "does not support DCP, reverting to FlashInfer" + ) + return False + # The platform is not supported if not supports_trtllm_attention(): if force_use_trtllm: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0b650e2e0d33..4da1637d96eb 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -10,6 +10,7 @@ from flashinfer import ( BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, + BatchPrefillWithRaggedKVCacheWrapper, MultiLevelCascadeAttentionWrapper, ) from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache @@ -24,8 +25,11 @@ AttentionType, MultipleOf, ) +from vllm.attention.ops.common import cp_lse_ag_out_rs +from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.config import CUDAGraphMode, VllmConfig from vllm.config.cache import CacheDType +from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, @@ -50,6 +54,7 @@ AttentionMetadataBuilder, CommonAttentionMetadata, KVCacheLayoutType, + get_dcp_local_seq_lens, get_kv_cache_layout, get_per_layer_parameters, infer_global_hyperparameters, @@ -160,6 +165,113 @@ def trtllm_prefill_attn_kvfp8_dequant( return mock_kv_cache, mock_block_table +class BatchDCPPrefillWrapper: + def __init__( + self, + workspace_buffer: torch.Tensor | None = None, + ): + self._context = BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, get_kv_cache_layout() + ) + self._new_tokens = BatchPrefillWithRaggedKVCacheWrapper( + workspace_buffer, get_kv_cache_layout() + ) + + def plan( + self, + qo_indptr_cpu: torch.Tensor, + paged_kv_indptr_cpu: torch.Tensor, + paged_kv_indices: torch.Tensor, + paged_kv_last_page_len_cpu: torch.Tensor, + prefill_start: int, + page_size: int, + num_qo_heads: int, + dcp_world_size: int, + num_kv_heads: int, + head_dim: int, + sm_scale: float, + window_left: int, + logits_soft_cap: float | None, + q_data_type: torch.dtype, + kv_cache_dtype: torch.dtype, + prefill_fixed_split_size: int, + disable_split_kv: bool, + ): + """Plan the prefill operation with given parameters.""" + self._context.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + paged_kv_indices, + paged_kv_last_page_len_cpu[prefill_start:], + num_qo_heads * dcp_world_size, + num_kv_heads, + head_dim, + page_size, + causal=False, # This is context run + sm_scale=sm_scale, + window_left=window_left, + logits_soft_cap=logits_soft_cap, + q_data_type=q_data_type, + kv_data_type=kv_cache_dtype, + fixed_split_size=prefill_fixed_split_size, + disable_split_kv=disable_split_kv, + ) + self._new_tokens.plan( + 
qo_indptr=qo_indptr_cpu, + kv_indptr=qo_indptr_cpu, + num_qo_heads=num_qo_heads, + num_kv_heads=num_kv_heads, + head_dim_qk=head_dim, + head_dim_vo=head_dim, + causal=True, # This is newtokens run + sm_scale=sm_scale, + window_left=window_left, + logits_soft_cap=logits_soft_cap, + q_data_type=q_data_type, + ) + + def run( + self, + layer: torch.nn.Module, + prefill_query: torch.Tensor, + kv_cache_permute: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + ): + prefill_query_across_dcp = get_dcp_group().all_gather( + prefill_query.contiguous(), dim=1 + ) + output_context_tmp, lse_context_tmp = self._context.run( + prefill_query_across_dcp, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + return_lse=True, + ) + output_context, lse_context = cp_lse_ag_out_rs( + output_context_tmp, lse_context_tmp, get_dcp_group(), return_lse=True + ) + lse_context = lse_context.transpose(0, 1).contiguous() + + output_query, lse_query = self._new_tokens.run( + prefill_query, + key, + value, + return_lse=True, + ) + lse_query = lse_query.transpose(0, 1).contiguous() + + merge_attn_states( + out, + output_context, + lse_context, + output_query, + lse_query, + ) + return out + + class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] @@ -281,7 +393,9 @@ class FlashInferMetadata: # For cascade attention (CPU for planning). use_cascade: bool - prefill_wrapper: BatchPrefillWithPagedKVCacheWrapper | None = None + prefill_wrapper: ( + BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None + ) = None decode_wrapper: BatchDecodeWithPagedKVCacheWrapper | None = None cascade_wrapper: MultiLevelCascadeAttentionWrapper | None = None @@ -303,7 +417,9 @@ def __init__( self.cache_config = vllm_config.cache_config self.model_config = vllm_config.model_config self._workspace_buffer = None - self._prefill_wrapper = None # Wrapper for prefill/append + self._prefill_wrapper: ( + BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None + ) = None # Wrapper for prefill/append self._decode_wrapper = None # Wrapper for decode (general shape) if vllm_is_batch_invariant(): @@ -341,9 +457,23 @@ def __init__( self.compilation_config.max_cudagraph_capture_size, ) - self.num_qo_heads = self.model_config.get_num_attention_heads( - self.vllm_config.parallel_config + try: + self.dcp_world_size = get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + self.dcp_kv_cache_interleave_size = ( + vllm_config.parallel_config.dcp_kv_cache_interleave_size + ) + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 + self.dcp_kv_cache_interleave_size = 1 + + self.num_qo_heads = ( + self.model_config.get_num_attention_heads(self.vllm_config.parallel_config) + * self.dcp_world_size ) + self.num_kv_heads = self.kv_cache_spec.num_kv_heads self.head_dim = self.kv_cache_spec.head_size self.page_size = self.kv_cache_spec.block_size @@ -455,11 +585,19 @@ def _get_workspace_buffer(self): ) return self._workspace_buffer - def _get_prefill_wrapper(self): + def _get_prefill_wrapper( + self, + ) -> BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper: if self._prefill_wrapper is None: - self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( - self._get_workspace_buffer(), get_kv_cache_layout() - ) + if self.dcp_world_size > 1: + self._prefill_wrapper = BatchDCPPrefillWrapper( + 
workspace_buffer=self._get_workspace_buffer(), + ) + else: + self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( + self._get_workspace_buffer(), get_kv_cache_layout() + ) + assert self._prefill_wrapper is not None return self._prefill_wrapper def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False): @@ -526,9 +664,29 @@ def build( max_seq_len = common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu - seq_lens_np = seq_lens_cpu.numpy() block_table_tensor = common_attn_metadata.block_table_tensor + qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu + if self.dcp_world_size > 1: + if num_prefills > 0: + qo_indptr_prefill_cpu = ( + qo_indptr_cpu[num_decodes:] - qo_indptr_cpu[num_decodes] + ) + query_lens_prefill_cpu = ( + qo_indptr_prefill_cpu[1:] - qo_indptr_prefill_cpu[:-1] + ) + seq_lens_cpu[num_decodes:] = ( + seq_lens_cpu[num_decodes:] - query_lens_prefill_cpu + ) + + seq_lens_cpu = get_dcp_local_seq_lens( + seq_lens_cpu, + self.dcp_world_size, + self.dcp_rank, + self.dcp_kv_cache_interleave_size, + ) + + seq_lens_np = seq_lens_cpu.numpy() num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size use_cascade = common_prefix_len > 0 @@ -589,7 +747,7 @@ def build( # write self.paged_kv_last_page_len_cpu inplace paged_kv_last_page_len_np = seq_lens_np % page_size self.paged_kv_last_page_len_np[:num_reqs] = np.where( - paged_kv_last_page_len_np == 0, + (paged_kv_last_page_len_np == 0) & (seq_lens_np != 0), page_size, paged_kv_last_page_len_np, ) @@ -600,13 +758,16 @@ def build( self.num_kv_heads, num_prefill_tokens, max_seq_len, + self.dcp_world_size, self.cache_dtype, self.q_data_type, is_prefill=True, has_sinks=self.has_sinks, has_spec=uses_spec_reorder, ) - decode_use_trtllm = self.use_trtllm_decode_attention + decode_use_trtllm = ( + self.use_trtllm_decode_attention and self.dcp_world_size <= 1 + ) if not (prefill_use_trtllm and decode_use_trtllm): if self.has_sinks: @@ -651,7 +812,6 @@ def build( use_cascade=use_cascade, ) - qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[: 1 + num_reqs] paged_kv_last_page_len_cpu = self.paged_kv_last_page_len_cpu[:num_reqs] @@ -703,24 +863,52 @@ def build( attn_metadata.max_q_len_prefill = int(query_lens_prefill.max().item()) if not attn_metadata.prefill_use_trtllm: - attn_metadata.prefill_wrapper.plan( - qo_indptr_cpu, - paged_kv_indptr_cpu, - paged_kv_indices, - paged_kv_last_page_len_cpu[prefill_start:], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.sm_scale, - window_left=self.window_left, - logits_soft_cap=self.logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - fixed_split_size=self.prefill_fixed_split_size, - disable_split_kv=self.disable_split_kv, - ) + if self.dcp_world_size > 1: + assert isinstance( + attn_metadata.prefill_wrapper, BatchDCPPrefillWrapper + ) + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu=qo_indptr_cpu, + paged_kv_indptr_cpu=paged_kv_indptr_cpu, + paged_kv_indices=paged_kv_indices, + paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu, + prefill_start=prefill_start, + page_size=self.page_size, + num_qo_heads=self.num_qo_heads, + dcp_world_size=self.dcp_world_size, + num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + 
kv_cache_dtype=self.kv_cache_dtype, + prefill_fixed_split_size=self.prefill_fixed_split_size, + disable_split_kv=self.disable_split_kv, + ) + else: + assert isinstance( + attn_metadata.prefill_wrapper, + BatchPrefillWithPagedKVCacheWrapper, + ) + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + paged_kv_indices, + paged_kv_last_page_len_cpu[prefill_start:], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + fixed_split_size=self.prefill_fixed_split_size, + disable_split_kv=self.disable_split_kv, + ) else: attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to( self.device, non_blocking=True @@ -770,7 +958,7 @@ def build( paged_kv_indices, self.paged_kv_last_page_len_cpu[:num_input_tokens], seq_lens_cpu[:num_input_tokens], - self.num_qo_heads, + self.num_qo_heads * self.dcp_world_size, self.num_kv_heads, self.head_dim, self.page_size, @@ -797,6 +985,8 @@ def use_cascade_attention(self, *args, **kwargs) -> bool: class FlashInferImpl(AttentionImpl): + can_return_lse_for_decode: bool = True + def __init__( self, num_heads: int, @@ -989,6 +1179,8 @@ def forward( # Inputs and outputs may be padded for CUDA graphs query = query[:num_actual_tokens] + key = key[:num_actual_tokens] + value = value[:num_actual_tokens] output_padded = output output = output[:num_actual_tokens] @@ -1015,17 +1207,46 @@ def forward( assert prefill_wrapper is not None if not attn_metadata.prefill_use_trtllm: - assert prefill_wrapper._causal - assert prefill_wrapper._window_left == self.window_left - assert prefill_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0) - assert prefill_wrapper._sm_scale == self.scale - prefill_wrapper.run( - prefill_query, - kv_cache_permute, - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - out=output[num_decode_tokens:], - ) + if self.dcp_world_size > 1: + assert isinstance(prefill_wrapper, BatchDCPPrefillWrapper) + assert prefill_wrapper._context._window_left == self.window_left + assert prefill_wrapper._context._logits_soft_cap == ( + self.logits_soft_cap or 0.0 + ) + assert prefill_wrapper._context._sm_scale == self.scale + assert not prefill_wrapper._context._causal + assert prefill_wrapper._new_tokens._window_left == self.window_left + assert prefill_wrapper._new_tokens._logits_soft_cap == ( + self.logits_soft_cap or 0.0 + ) + assert prefill_wrapper._new_tokens._sm_scale == self.scale + assert prefill_wrapper._new_tokens._causal + + prefill_wrapper.run( + layer, + prefill_query, + kv_cache_permute, + key[num_decode_tokens:], + value[num_decode_tokens:], + out=output[num_decode_tokens:], + ) + else: + assert isinstance( + prefill_wrapper, BatchPrefillWithPagedKVCacheWrapper + ) + assert prefill_wrapper._window_left == self.window_left + assert prefill_wrapper._logits_soft_cap == ( + self.logits_soft_cap or 0.0 + ) + assert prefill_wrapper._sm_scale == self.scale + assert prefill_wrapper._causal + prefill_wrapper.run( + prefill_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output[num_decode_tokens:], + ) else: # prefill_query may be non-contiguous prefill_query = prefill_query.contiguous() @@ -1101,13 +1322,37 @@ def forward( assert decode_wrapper._window_left == self.window_left assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0) assert decode_wrapper._sm_scale == 
self.scale - decode_wrapper.run( - decode_query, - kv_cache_permute, - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - out=output[:num_decode_tokens], - ) + + if self.dcp_world_size > 1: + decode_query = get_dcp_group().all_gather( + decode_query.contiguous(), dim=-2 + ) + output_tmp = torch.empty_like(decode_query) + lse = torch.empty( + (decode_query.size(0), decode_query.size(1)), + dtype=torch.float32, + device=decode_query.device, + ) + decode_wrapper.run( + decode_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output_tmp, + lse=lse, + return_lse=True, + ) + output[:num_decode_tokens] = cp_lse_ag_out_rs( + output_tmp, lse, get_dcp_group() + ) + else: + decode_wrapper.run( + decode_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output[:num_decode_tokens], + ) else: # decode_query may be non-contiguous decode_query = decode_query.contiguous() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 1e249161c688..881e6ef40aaf 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -31,6 +31,7 @@ from vllm.distributed.device_communicators.shm_broadcast import Handle, MessageQueue from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.distributed.parallel_state import ( + get_dcp_group, get_dp_group, get_ep_group, get_pp_group, @@ -726,6 +727,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: pp_rank = get_pp_group().rank_in_group tp_size = get_tp_group().world_size tp_rank = get_tp_group().rank_in_group + dcp_size = get_dcp_group().world_size + dcp_rank = get_dcp_group().rank_in_group process_name = "Worker" if dp_size > 1: process_name += f"_DP{dp_rank}" @@ -733,6 +736,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: process_name += f"_PP{pp_rank}" if tp_size > 1: process_name += f"_TP{tp_rank}" + if dcp_size > 1: + process_name += f"_DCP{dcp_rank}" if enable_ep: ep_rank = get_ep_group().rank_in_group process_name += f"_EP{ep_rank}" From 9324e10275cce6e0fd189bf1ebb0c399d858e9e1 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:53:42 -1000 Subject: [PATCH 051/578] Fix KV sharing fast prefill with cudagraph enabled (#28537) Signed-off-by: Yong Hoon Shin Co-authored-by: Cyrus Leung --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 57 +++++--------------- vllm/v1/attention/backends/utils.py | 15 +----- vllm/v1/worker/gpu_model_runner.py | 2 +- 3 files changed, 17 insertions(+), 57 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index f2c6d1c1fd1a..2778b0c5e567 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -4,13 +4,11 @@ import random import pytest -import torch from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationMode -from vllm.distributed import cleanup_dist_env_and_memory -from ...utils import fork_new_process_for_each_test +from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts # global seed SEED = 42 @@ -45,28 +43,12 @@ def test_prompts(): return prompts -def cleanup(llm: LLM, compilation_config: CompilationConfig): - # hacky: below lines are required to free up memory for the next test - # when setting VLLM_ENABLE_V1_MULTIPROCESSING=0, del llm is not 
sufficient - # TODO(sarckk): when enforce_eager=False, memory is not freed: - # find out why and re-enable test for enforce_eager=False case - llm_engine = llm.llm_engine.engine_core.engine_core - model_runner = llm_engine.model_executor.driver_worker.worker.model_runner - del model_runner.model - del model_runner.kv_caches - del compilation_config.static_forward_context - compilation_config.static_forward_context = {} - - del llm - torch.cuda.empty_cache() - cleanup_dist_env_and_memory() - - @fork_new_process_for_each_test -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill") +@pytest.mark.parametrize("kv_sharing_fast_prefill", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True, False]) def test_kv_sharing_fast_prefill( monkeypatch: pytest.MonkeyPatch, + kv_sharing_fast_prefill: bool, enforce_eager: bool, test_prompts: list[str], ): @@ -79,36 +61,25 @@ def test_kv_sharing_fast_prefill( if not enforce_eager else CompilationMode.NONE, ) + batch_size = 10 with monkeypatch.context() as m: # Make scheduling deterministic for reproducibility m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - llm = LLM( - model="google/gemma-3n-E2B-it", - enforce_eager=enforce_eager, - compilation_config=compilation_config, - seed=SEED, - ) - ref_responses = llm.generate(test_prompts, sampling_params) - - cleanup(llm, compilation_config) + prompts, answer, indices = prep_prompts(batch_size) llm = LLM( model="google/gemma-3n-E2B-it", enforce_eager=enforce_eager, compilation_config=compilation_config, seed=SEED, - kv_sharing_fast_prefill=True, + kv_sharing_fast_prefill=kv_sharing_fast_prefill, + ) + responses = llm.generate(prompts, sampling_params) + check_answers( + indices, + answer, + [response.outputs[0].text for response in responses], + accept_rate=1.0, ) - optimized_responses = llm.generate(test_prompts, sampling_params) - - cleanup(llm, compilation_config) - - misses = 0 - - for ref_response, optimized_response in zip(ref_responses, optimized_responses): - if ref_response.outputs[0].text != optimized_response.outputs[0].text: - misses += 1 - - assert misses == 0 diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index fd37a665cf05..578153cda786 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -965,12 +965,6 @@ def reshape_attn_output_for_spec_decode(attn_output: torch.Tensor) -> torch.Tens return attn_output.view(total_tokens, attn_output.shape[2], attn_output.shape[3]) -KV_SHARING_FAST_PREFILL_METADATA_FIELDS = [ - ("logits_indices_padded", torch.Tensor | None, None), - ("num_logits_indices", int, 0), -] - - def subclass_attention_metadata( name_prefix: str, metadata_cls: Any, @@ -986,8 +980,8 @@ def subclass_attention_metadata( @runtime_checkable class KVSharingFastPrefillMetadata(Protocol): - logits_indices_padded: torch.Tensor - num_logits_indices: int + logits_indices_padded: torch.Tensor | None = None + num_logits_indices: int | None = None def create_fast_prefill_custom_backend( @@ -1019,11 +1013,6 @@ def __init__(self, metadata, common_attn_metadata): for _field in fields(metadata.__class__): setattr(self, _field.name, getattr(metadata, _field.name)) - # Set additional fields that will be used in model code - assert ( - common_attn_metadata.logits_indices_padded is not None - and common_attn_metadata.num_logits_indices is not None - ) self.logits_indices_padded = ( common_attn_metadata.logits_indices_padded ) diff --git 
a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0f7f3a501f5..341bf58f2da8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1314,7 +1314,7 @@ def _build_attention_metadata( :return: tuple[attn_metadata, spec_decode_common_attn_metadata] """ logits_indices_padded = None - num_logits_indices = 0 + num_logits_indices = None if logits_indices is not None: num_logits_indices = logits_indices.size(0) if self.cache_config.kv_sharing_fast_prefill: From db56a59970a84842da2adc3aa64e436f42448b48 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 14 Nov 2025 07:19:22 -0500 Subject: [PATCH 052/578] [BugFix] Fix FA3 IMA with FULL_AND_PIECEWISE and cascade attention (default) (#28702) --- tests/kernels/attention/test_cascade_flash_attn.py | 1 + vllm/v1/attention/backends/flash_attn.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 4295f852f95b..20f573821b25 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -170,6 +170,7 @@ def test_cascade( logits_soft_cap=soft_cap if soft_cap is not None else 0, block_table=block_tables, common_prefix_len=common_prefix_len, + max_num_splits=0, # no max fa_version=fa_version, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 81623549ae85..a5d4435000d4 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -704,6 +704,7 @@ def forward( logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, + max_num_splits=attn_metadata.max_num_splits, fa_version=self.vllm_flash_attn_version, prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata, suffix_scheduler_metadata=attn_metadata.scheduler_metadata, @@ -950,6 +951,7 @@ def cascade_attention( logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, + max_num_splits: int, fa_version: int, prefix_scheduler_metadata: torch.Tensor | None = None, suffix_scheduler_metadata: torch.Tensor | None = None, @@ -994,7 +996,7 @@ def cascade_attention( # s_aux is incorporated into prefix_lse inside the GPU kernel, # enabling its effect during the final attention merge. s_aux=s_aux, - num_splits=1 if vllm_is_batch_invariant() else 0, + num_splits=1 if vllm_is_batch_invariant() else max_num_splits, ) descale_shape = (cu_query_lens.shape[0] - 1, key_cache.shape[-2]) @@ -1019,7 +1021,7 @@ def cascade_attention( q_descale=q_descale.expand(descale_shape) if q_descale is not None else None, k_descale=k_descale.expand(descale_shape) if k_descale is not None else None, v_descale=v_descale.expand(descale_shape) if v_descale is not None else None, - num_splits=1 if vllm_is_batch_invariant() else 0, + num_splits=1 if vllm_is_batch_invariant() else max_num_splits, ) # Merge prefix and suffix outputs, and store the result in output. 
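
The FA3 fix above reduces to a single invariant: the split count handed to the cascade-attention kernels must be the `max_num_splits` cap carried in the attention metadata, not the hardcoded `0` ("let the kernel decide") that the patch replaces, so that the kernel's choice stays within what the pre-sized workspace assumed under FULL_AND_PIECEWISE CUDA graphs. Below is a minimal standalone sketch of that invariant; the names (`Workspace`, `pick_num_splits`, `batch_invariant`) are invented for illustration and are not vLLM APIs.

```python
from dataclasses import dataclass


@dataclass
class Workspace:
    # Cap the intermediate buffers were sized for at CUDA-graph capture time.
    max_num_splits: int


def pick_num_splits(workspace: Workspace, batch_invariant: bool) -> int:
    """Split count to pass to the attention kernel for this launch."""
    if batch_invariant:
        # Batch-invariant mode pins a single split for deterministic results.
        return 1
    # Returning 0 would let the kernel choose freely; inside a captured graph
    # that choice could exceed the workspace cap, so the cap itself is
    # forwarded instead (mirroring the `max_num_splits` plumbing above).
    return workspace.max_num_splits


ws = Workspace(max_num_splits=8)
assert pick_num_splits(ws, batch_invariant=True) == 1
assert pick_num_splits(ws, batch_invariant=False) == 8
```
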
From 8d3748d3c718dd1dfb1f7e9e0825bc9032bff75a Mon Sep 17 00:00:00 2001 From: Fasal Shah Date: Fri, 14 Nov 2025 18:13:56 +0530 Subject: [PATCH 053/578] [Doc] Fix macOS installation dependency resolution issue (#26721) Signed-off-by: faisal shah --- docs/getting_started/installation/cpu.apple.inc.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/getting_started/installation/cpu.apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md index 7e2ed55008a5..4dc707d5f9a1 100644 --- a/docs/getting_started/installation/cpu.apple.inc.md +++ b/docs/getting_started/installation/cpu.apple.inc.md @@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -uv pip install -r requirements/cpu.txt +uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match uv pip install -e . ``` +!!! tip + The `--index-strategy unsafe-best-match` flag is needed to resolve dependencies across multiple package indexes (PyTorch CPU index and PyPI). Without this flag, you may encounter `typing-extensions` version conflicts. + + The term "unsafe" refers to the package resolution strategy, not security. By default, `uv` only searches the first index where a package is found to prevent dependency confusion attacks. This flag allows `uv` to search all configured indexes to find the best compatible versions. Since both PyTorch and PyPI are trusted package sources, using this strategy is safe and appropriate for vLLM installation. + !!! note On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device. From 433c0f86751f20dbdfdeb1a711def99b7ae3df92 Mon Sep 17 00:00:00 2001 From: zhaozx-cn <59479021+zhaozx-cn@users.noreply.github.com> Date: Fri, 14 Nov 2025 21:33:02 +0800 Subject: [PATCH 054/578] [Model] Fix bailing_moe accuracy problem (#28277) Signed-off-by: zhaozx-cn --- vllm/model_executor/models/bailing_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a87813402256..6e1e5b1ddc50 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -39,7 +39,6 @@ get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import SharedFusedMoE @@ -330,7 +329,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) return final_hidden_states.view(num_tokens, hidden_size) From 96b23b8e3b5cd5d05345489a304e65f7ab53ef8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 14 Nov 2025 15:40:05 +0100 Subject: [PATCH 055/578] [Bugfix][Nixl] Fix kernel physical<>logical block_size issue (#28677) Signed-off-by: NickLucche --- tests/v1/worker/test_gpu_model_runner.py | 6 +- .../kv_connector/v1/nixl_connector.py | 67 ++++++++++++++++--- vllm/v1/worker/block_table.py | 17 +++-- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 
b02d9a657407..b95c8df3469b 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -985,8 +985,10 @@ def test_hybrid_block_table_initialization(): req_index = 0 block_table.append_row(kvcache_manager_blocks, req_index) # Get expected kernel blocks from the implementation for verification. - expected_kernel_blocks = block_table._map_to_kernel_blocks( - np.array(kvcache_manager_blocks) + expected_kernel_blocks = block_table.map_to_kernel_blocks( + np.array(kvcache_manager_blocks), + block_table.blocks_per_kv_block, + block_table._kernel_block_arange, ) # Verify block table state assert block_table.num_blocks_per_row[req_index] == len(expected_kernel_blocks) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 42433c717cf2..3d4547c51453 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -49,6 +49,7 @@ from vllm.utils.network_utils import make_zmq_path, make_zmq_socket from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -112,6 +113,8 @@ class NixlAgentMetadata(KVConnectorHandshakeMetadata): @dataclass class ReqMeta: local_block_ids: list[int] + # To be used when logical block size does not match the kernel block size + local_physical_block_ids: list[int] remote_block_ids: list[int] remote_host: str remote_port: int @@ -139,6 +142,7 @@ def add_new_req( assert load_remote_cache ^ save_to_host _req = ReqMeta( local_block_ids=local_block_ids, + local_physical_block_ids=local_block_ids, remote_block_ids=kv_transfer_params["remote_block_ids"], remote_engine_id=kv_transfer_params["remote_engine_id"], remote_host=kv_transfer_params["remote_host"], @@ -935,6 +939,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): attn_backend=backend, ) self._use_pallas = self.kv_topo._use_pallas + self._physical_blocks_per_logical_kv_block = 1 def _nixl_handshake( self, @@ -1133,6 +1138,22 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): if base_addr in seen_base_addresses: continue + # TODO (NickLucche): Get kernel_block_size in a cleaner way + # NHD default "view" for non-MLA cache + kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3] + + if self.block_size != kernel_block_size: + logger.info_once( + "User-specified logical block size (%s) does not match" + " physical kernel block size (%s). Using the latter. ", + self.block_size, + kernel_block_size, + ) + self._physical_blocks_per_logical_kv_block = ( + self.block_size // kernel_block_size + ) + self.block_size = kernel_block_size + seen_base_addresses.append(base_addr) curr_tensor_size_bytes = cache.numel() * cache.element_size() @@ -1479,7 +1500,7 @@ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): assert self.use_host_buffer assert self.copy_blocks is not None - local_block_ids = meta.local_block_ids + local_block_ids = meta.local_physical_block_ids self.copy_blocks( self.host_xfer_buffers, self.device_kv_caches, @@ -1492,7 +1513,7 @@ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): "synced recved kv of request[%s] to device kv buffer," "local_block_ids: %s. 
", req_id, - ",".join(map(str, meta.local_block_ids)), + ",".join(map(str, local_block_ids)), ) def save_kv_to_host(self, metadata: NixlConnectorMetadata): @@ -1501,19 +1522,22 @@ def save_kv_to_host(self, metadata: NixlConnectorMetadata): assert self.copy_blocks is not None for req_id, meta in metadata.reqs_to_save.items(): + meta.local_physical_block_ids = self._logical_to_kernel_block_ids( + meta.local_block_ids + ) if logger.isEnabledFor(logging.DEBUG): logger.debug( "save_load_kv for request[%s] to host xfer buffer." "local_block_ids: %s. ", req_id, - ",".join(map(str, meta.local_block_ids)), + ",".join(map(str, meta.local_physical_block_ids)), ) # blocking self.copy_blocks( self.device_kv_caches, self.host_xfer_buffers, - meta.local_block_ids, - meta.local_block_ids, + meta.local_physical_block_ids, + meta.local_physical_block_ids, "d2h", ) @@ -1582,7 +1606,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: if self.use_host_buffer: self.sync_recved_kv_to_device(req_id, meta) if self.enable_permute_local_kv: - block_ids_to_permute += meta.local_block_ids + block_ids_to_permute += meta.local_physical_block_ids if len(block_ids_to_permute) > 0: self.permute_device_kv(block_ids_to_permute) @@ -1669,7 +1693,7 @@ def _pop_done_transfers( req_id, xfer_state, ) - # mark all blocks for this request as invalid + # mark all (logical)blocks for this request as invalid if meta := self._recving_metadata.pop(req_id, None): self._invalid_block_ids.update(meta.local_block_ids) self._recving_metadata.pop(req_id, None) @@ -1686,13 +1710,19 @@ def start_load_kv(self, metadata: NixlConnectorMetadata): We check for these trnxs to complete in each step(). """ for req_id, meta in metadata.reqs_to_recv.items(): + meta.local_physical_block_ids = self._logical_to_kernel_block_ids( + meta.local_block_ids + ) + meta.remote_block_ids = self._logical_to_kernel_block_ids( + meta.remote_block_ids + ) remote_engine_id = meta.remote_engine_id logger.debug( "start_load_kv for request %s from remote engine %s. " "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id, remote_engine_id, - len(meta.local_block_ids), + len(meta.local_physical_block_ids), len(meta.remote_block_ids), ) # always store metadata for failure recovery @@ -1740,7 +1770,7 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta): self._read_blocks( request_id=req_id, dst_engine_id=meta.remote_engine_id, - local_block_ids=meta.local_block_ids, + local_block_ids=meta.local_physical_block_ids, remote_block_ids=meta.remote_block_ids, ) @@ -1867,7 +1897,7 @@ def _read_blocks( "Marking blocks as invalid.", request_id, ) - # mark all blocks for this request as invalid + # mark all (logical) blocks for this request as invalid if meta := self._recving_metadata.get(request_id): self._invalid_block_ids.update(meta.local_block_ids) self.xfer_stats.record_failed_transfer() @@ -1906,6 +1936,23 @@ def _get_block_descs_ids( descs_ids = region_ids * num_blocks + block_ids return descs_ids.flatten() + def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: + """ + Convert logical block ids to kernel physical block ids. + This is required when the logical block size (the one set by the user) + does not match the one required by the attn backend. 
+ """ + if self._physical_blocks_per_logical_kv_block == 1: + # Noop when physical and logical block sizes are the same + return block_ids + block_ids_np = np.array(block_ids) + block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape( + 1, -1 + ) + return BlockTable.map_to_kernel_blocks( + block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange + ).tolist() + def get_backend_aware_kv_block_len(self, layer_idx: int): """ Get the block length for one K/V element (K and V have the same size). diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index c28bf542f85c..9f6c19e46430 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -98,7 +98,9 @@ def append_row( return if self.use_hybrid_blocks: - block_ids = self._map_to_kernel_blocks(np.array(block_ids)) + block_ids = self.map_to_kernel_blocks( + np.array(block_ids), self.blocks_per_kv_block, self._kernel_block_arange + ) num_blocks = len(block_ids) start = self.num_blocks_per_row[row_idx] @@ -188,7 +190,12 @@ def clear(self) -> None: self.block_table.gpu.fill_(0) self.block_table.cpu.fill_(0) - def _map_to_kernel_blocks(self, kv_manager_block_ids: np.ndarray) -> np.ndarray: + @staticmethod + def map_to_kernel_blocks( + kv_manager_block_ids: np.ndarray, + blocks_per_kv_block: int, + kernel_block_arange: np.ndarray, + ) -> np.ndarray: """Convert kv_manager_block_id IDs to kernel block IDs. Example: @@ -203,12 +210,12 @@ def _map_to_kernel_blocks(self, kv_manager_block_ids: np.ndarray) -> np.ndarray: # kv_manager_block_id 1 → kernel block id [2, 3] # kv_manager_block_id 2 → kernel block id [4, 5] """ - if not self.use_hybrid_blocks: + if blocks_per_kv_block == 1: return kv_manager_block_ids kernel_block_ids = ( - kv_manager_block_ids.reshape(-1, 1) * self.blocks_per_kv_block - + self._kernel_block_arange + kv_manager_block_ids.reshape(-1, 1) * blocks_per_kv_block + + kernel_block_arange ) return kernel_block_ids.reshape(-1) From 511a6b611d2e7e6b13e09c050147b367434f1a54 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Nov 2025 22:41:02 +0800 Subject: [PATCH 056/578] [Config] Clean up SchedulerConfig initialization (#28665) Signed-off-by: DarkLight1337 --- .../models/language/generation/test_hybrid.py | 7 +- tests/v1/core/test_scheduler.py | 2 + tests/v1/sample/test_logprobs.py | 1 + vllm/config/scheduler.py | 102 +++------ vllm/engine/arg_utils.py | 208 ++++++++++++------ vllm/platforms/cpu.py | 4 +- vllm/platforms/tpu.py | 4 +- vllm/platforms/xpu.py | 4 +- vllm/utils/__init__.py | 11 +- 9 files changed, 181 insertions(+), 162 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 681b380e6a15..37830093cd3c 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -348,9 +348,14 @@ def test_fp32_cache_state( # Helper functions for the APC tests -def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1): +def _get_vllm_runner_params( + model: str, + max_model_len: int, + tensor_parallel_size: int = 1, +): return { "model_name": model, + "enable_chunked_prefill": True, "enable_prefix_caching": False, "max_model_len": max_model_len, "tensor_parallel_size": tensor_parallel_size, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d31338220fca..287e735b5491 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -2256,6 +2256,8 @@ def 
test_chunked_prefill_disabled_for_encoder_decoder( scheduler_config = SchedulerConfig( enable_chunked_prefill=enable_chunked_prefill, is_encoder_decoder=is_encoder_decoder, + # Must <= max_num_batched_tokens if chunked prefill is disabled + max_model_len=SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) # `is_encoder_decoder` should only be used during construction diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 354fff22dc2a..42584938bc06 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -47,6 +47,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]: max_num_batched_tokens=16, max_num_seqs=16, max_model_len=128, + enable_chunked_prefill=True, enforce_eager=True, # TODO: enable this once we support it for # prompt logprobs. diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 71a06e167fd9..5117344a6844 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -4,7 +4,7 @@ import hashlib from collections.abc import Callable from dataclasses import InitVar -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from pydantic import Field, field_validator, model_validator from pydantic.dataclasses import dataclass @@ -12,11 +12,6 @@ from vllm.config.utils import config from vllm.logger import init_logger -from vllm.utils import ( - DEFAULT_MAX_NUM_BATCHED_TOKENS, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, -) from vllm.utils.import_utils import resolve_obj_by_qualname if TYPE_CHECKING: @@ -33,25 +28,32 @@ class SchedulerConfig: """Scheduler configuration.""" + DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048 + DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128 + runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int = Field(default=None, ge=1) + max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1) """Maximum number of tokens to be processed in a single iteration. - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ - max_num_seqs: int = Field(default=None, ge=1) + max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1) """Maximum number of sequences to be processed in a single iteration. - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ - max_model_len: int = Field(default=None, ge=1) - """Maximum length of a sequence (including prompt and generated text). This - is primarily set in `ModelConfig` and that value should be manually - duplicated here.""" + max_model_len: int = Field(default=8192, ge=1) + """Maximum length of a sequence (including prompt and generated text). + + The default value here is mainly for convenience when testing. 
+ In real usage, this should duplicate `ModelConfig.max_model_len` via + `EngineArgs`.""" max_num_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of sequences that can be @@ -76,9 +78,13 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - enable_chunked_prefill: bool = Field(default=None) + enable_chunked_prefill: bool = True """If True, prefill requests can be chunked based - on the remaining max_num_batched_tokens.""" + on the remaining `max_num_batched_tokens`. + + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ is_multimodal_model: bool = False """True if the model is multimodal.""" @@ -111,9 +117,6 @@ class SchedulerConfig: - "priority" means requests are handled based on given priority (lower value means earlier handling) and time of arrival deciding any ties).""" - chunked_prefill_enabled: bool = Field(init=False) - """True if chunked prefill is enabled.""" - disable_chunked_mm_input: bool = False """If set to true and chunked prefill is enabled, we do not want to partially schedule a multimodal item. Only used in V1 @@ -188,15 +191,7 @@ def compute_hash(self) -> str: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str - @field_validator( - "max_num_batched_tokens", - "max_num_seqs", - "max_model_len", - "enable_chunked_prefill", - "scheduler_cls", - "async_scheduling", - mode="wrap", - ) + @field_validator("scheduler_cls", "async_scheduling", mode="wrap") @classmethod def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: """Skip validation if the value is `None` when initialisation is delayed.""" @@ -205,16 +200,9 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: return handler(value) def __post_init__(self, is_encoder_decoder: bool) -> None: - if self.max_model_len is None: - self.max_model_len = 8192 - - if self.max_num_seqs is None: - self.max_num_seqs = 128 - if is_encoder_decoder: # Chunked prefill should be disabled for encoder-decoder models. self.disable_chunked_mm_input = True - self.chunked_prefill_enabled = False self.enable_chunked_prefill = False self.long_prefill_token_threshold = 0 logger.info( @@ -222,37 +210,6 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: " prefix caching; disabling both." ) - if self.max_num_batched_tokens is None: - if self.enable_chunked_prefill: - self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS - else: - # If max_model_len is too short, use - # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value - # for higher throughput. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS - ) - - if self.runner_type == "pooling": - # Choose specific value for higher throughput - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - if self.is_multimodal_model: - # The value needs to be at least the number of multimodal tokens - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - - # When using default settings, - # Ensure max_num_batched_tokens does not exceed model limit. - # Some models (e.g., Whisper) have embeddings tied to max length. 
- self.max_num_batched_tokens = min( - self.max_num_seqs * self.max_model_len, self.max_num_batched_tokens - ) - self.max_num_encoder_input_tokens = self.max_num_batched_tokens self.encoder_cache_size = self.max_num_batched_tokens @@ -262,7 +219,6 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: self.max_num_batched_tokens, ) - self.chunked_prefill_enabled = self.enable_chunked_prefill if self.max_num_partial_prefills > 1: if self.long_prefill_token_threshold == 0: self.long_prefill_token_threshold = int(self.max_model_len * 0.04) @@ -276,6 +232,14 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: self.long_prefill_token_threshold, ) + @property + def chunked_prefill_enabled(self) -> bool: + return self.enable_chunked_prefill + + @chunked_prefill_enabled.setter + def chunked_prefill_enabled(self, value: bool): + self.enable_chunked_prefill = value + @model_validator(mode="after") def _verify_args(self) -> Self: if ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b025004ea022..cacebc530b6e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -428,11 +428,11 @@ class EngineArgs: cpu_offload_gb: float = CacheConfig.cpu_offload_gb gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes - max_num_batched_tokens: int | None = SchedulerConfig.max_num_batched_tokens + max_num_batched_tokens: int | None = None max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold - max_num_seqs: int | None = SchedulerConfig.max_num_seqs + max_num_seqs: int | None = None max_logprobs: int = ModelConfig.max_logprobs logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode disable_log_stats: bool = False @@ -485,7 +485,7 @@ class EngineArgs: model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") - enable_chunked_prefill: bool | None = SchedulerConfig.enable_chunked_prefill + enable_chunked_prefill: bool | None = None disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input disable_hybrid_kv_cache_manager: bool = ( @@ -1738,41 +1738,41 @@ def _check_feature_supported(self, model_config: ModelConfig): ) _raise_unsupported_error(feature_name=name) - def _set_default_args( - self, usage_context: UsageContext, model_config: ModelConfig - ) -> None: - """Set Default Arguments for V1 Engine.""" - - # V1 uses chunked prefills and prefix caching by default - # for non-pooling tasks. - # For pooling tasks the default is False + @classmethod + def get_chunked_prefill_prefix_caching_defaults( + cls, + model_config: ModelConfig, + ) -> tuple[bool, bool]: if model_config.runner_type != "pooling": - self.enable_chunked_prefill = True - - if self.enable_prefix_caching is None: - # Disable prefix caching default for hybrid models - # since the feature is still experimental. - if model_config.is_hybrid: - self.enable_prefix_caching = False - else: - self.enable_prefix_caching = True + default_chunked_prefill = True + + # Disable prefix caching default for hybrid models + # since the feature is still experimental. 
+ default_prefix_caching = not model_config.is_hybrid else: + assert model_config.pooler_config is not None + pooling_type = model_config.pooler_config.pooling_type - is_causal = getattr(model_config.hf_config, "is_causal", True) incremental_prefill_supported = ( pooling_type is not None and pooling_type.lower() == "last" - and bool(is_causal) + and getattr(model_config.hf_config, "is_causal", True) ) - action = "Enabling" if incremental_prefill_supported else "Disabling" + default_chunked_prefill = incremental_prefill_supported + default_prefix_caching = incremental_prefill_supported + + return default_chunked_prefill, default_prefix_caching + + @classmethod + def get_batch_defaults( + cls, + world_size: int, + ) -> tuple[dict[UsageContext | None, int], dict[UsageContext | None, int]]: + from vllm.usage.usage_lib import UsageContext - if self.enable_chunked_prefill is None: - self.enable_chunked_prefill = incremental_prefill_supported - logger.info("(%s) chunked prefill by default", action) - if self.enable_prefix_caching is None: - self.enable_prefix_caching = incremental_prefill_supported - logger.info("(%s) prefix caching by default", action) + default_max_num_batched_tokens: dict[UsageContext | None, int] + default_max_num_seqs: dict[UsageContext | None, int] # When no user override, set the default values based on the usage # context. @@ -1793,8 +1793,6 @@ def _set_default_args( # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces # throughput, see PR #17885 for more details. # So here we do an extra device name check to prevent such regression. - from vllm.usage.usage_lib import UsageContext - if device_memory >= 70 * GiB_bytes and "a100" not in device_name: # For GPUs like H100 and MI300x, use larger default values. default_max_num_batched_tokens = { @@ -1818,22 +1816,26 @@ def _set_default_args( # tpu specific default values. if current_platform.is_tpu(): - default_max_num_batched_tokens_tpu = { - UsageContext.LLM_CLASS: { - "V6E": 2048, - "V5E": 1024, - "V5P": 512, - }, - UsageContext.OPENAI_API_SERVER: { - "V6E": 1024, - "V5E": 512, - "V5P": 256, - }, - } + chip_name = current_platform.get_device_name() + + if chip_name == "V6E": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 2048, + UsageContext.OPENAI_API_SERVER: 1024, + } + elif chip_name == "V5E": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 1024, + UsageContext.OPENAI_API_SERVER: 512, + } + elif chip_name == "V5P": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 512, + UsageContext.OPENAI_API_SERVER: 256, + } # cpu specific default values. 
if current_platform.is_cpu(): - world_size = self.pipeline_parallel_size * self.tensor_parallel_size default_max_num_batched_tokens = { UsageContext.LLM_CLASS: 4096 * world_size, UsageContext.OPENAI_API_SERVER: 2048 * world_size, @@ -1843,44 +1845,104 @@ def _set_default_args( UsageContext.OPENAI_API_SERVER: 128 * world_size, } - use_context_value = usage_context.value if usage_context else None - if ( - self.max_num_batched_tokens is None - and usage_context in default_max_num_batched_tokens + return default_max_num_batched_tokens, default_max_num_seqs + + def _set_default_args( + self, usage_context: UsageContext, model_config: ModelConfig + ) -> None: + """Set Default Arguments for V1 Engine.""" + ( + default_chunked_prefill, + default_prefix_caching, + ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + + if self.enable_chunked_prefill is None: + self.enable_chunked_prefill = default_chunked_prefill + + logger.debug( + "%s chunked prefill by default", + "Enabling" if default_chunked_prefill else "Disabling", + ) + elif ( + model_config.runner_type == "pooling" + and self.enable_chunked_prefill + and not default_chunked_prefill ): - if current_platform.is_tpu(): - chip_name = current_platform.get_device_name() - if chip_name in default_max_num_batched_tokens_tpu[usage_context]: - self.max_num_batched_tokens = default_max_num_batched_tokens_tpu[ - usage_context - ][chip_name] - else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context - ] - else: - if not self.enable_chunked_prefill: - self.max_num_batched_tokens = model_config.max_model_len - else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context - ] + logger.warning( + "This model does not officially support chunked prefill. " + "Enabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + ) + + if self.enable_prefix_caching is None: + self.enable_prefix_caching = default_prefix_caching + logger.debug( - "Setting max_num_batched_tokens to %d for %s usage context.", + "%s prefix caching by default", + "Enabling" if default_prefix_caching else "Disabling", + ) + elif ( + model_config.runner_type == "pooling" + and self.enable_prefix_caching + and not default_prefix_caching + ): + logger.warning( + "This model does not officially support prefix caching. " + "Enabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + ) + + world_size = self.pipeline_parallel_size * self.tensor_parallel_size + ( + default_max_num_batched_tokens, + default_max_num_seqs, + ) = self.get_batch_defaults(world_size) + + orig_max_num_batched_tokens = self.max_num_batched_tokens + orig_max_num_seqs = self.max_num_seqs + + if self.max_num_batched_tokens is None: + self.max_num_batched_tokens = default_max_num_batched_tokens.get( + usage_context, + SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS, + ) + + if self.max_num_seqs is None: + self.max_num_seqs = default_max_num_seqs.get( + usage_context, + SchedulerConfig.DEFAULT_MAX_NUM_SEQS, + ) + + if orig_max_num_batched_tokens is None: + if not self.enable_chunked_prefill: + # If max_model_len is too short, use the default for higher throughput. + self.max_num_batched_tokens = max( + model_config.max_model_len, + self.max_num_batched_tokens, + ) + + # When using default settings, + # Ensure max_num_batched_tokens does not exceed model limit. + # Some models (e.g., Whisper) have embeddings tied to max length. 
+ self.max_num_batched_tokens = min( + self.max_num_seqs * model_config.max_model_len, self.max_num_batched_tokens, - use_context_value, ) - if self.max_num_seqs is None and usage_context in default_max_num_seqs: - self.max_num_seqs = min( - default_max_num_seqs[usage_context], - self.max_num_batched_tokens or sys.maxsize, + logger.debug( + "Defaulting max_num_batched_tokens to %d for %s usage context.", + self.max_num_batched_tokens, + usage_context.value if usage_context else None, ) + if orig_max_num_seqs is None: + assert self.max_num_batched_tokens is not None # For type checking + self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens) + logger.debug( - "Setting max_num_seqs to %d for %s usage context.", + "Defaulting max_num_seqs to %d for %s usage context.", self.max_num_seqs, - use_context_value, + usage_context.value if usage_context else None, ) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cf954768689f..fdfa1c19789c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -15,7 +15,6 @@ from vllm import envs from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import CpuArchEnum, Platform, PlatformEnum @@ -339,10 +338,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "prefill and prefix caching to be disabled." ) vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS, + vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) @classmethod diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index b997bb9e6999..4ab037fdb77e 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -10,7 +10,6 @@ from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger from vllm.sampling_params import SamplingParams, SamplingType -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum @@ -186,10 +185,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "prefill and prefix caching to be disabled." ) vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS, + vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) @classmethod diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 5552e4ca4b2f..ad4beb28bdae 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -9,7 +9,6 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import DeviceCapability, Platform, PlatformEnum @@ -185,10 +184,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "prefill and prefix caching to be disabled." 
) vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS, + vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) @classmethod diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 040c0416c5ea..3ef44e770320 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3,7 +3,7 @@ import uuid import warnings -from typing import Any, TypeVar +from typing import Any import torch @@ -39,12 +39,6 @@ def __dir__() -> list[str]: logger = init_logger(__name__) -# This value is chosen to have a balance between ITL and TTFT. Note it is -# not optimized for throughput. -DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 -POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 -MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 - # Constants related to forcing the attention backend selection # String name of register which may be set in order to @@ -60,9 +54,6 @@ def __dir__() -> list[str]: STR_INVALID_VAL: str = "INVALID" -T = TypeVar("T") - - def random_uuid() -> str: return str(uuid.uuid4().hex) From 3f8a8740656f2c0079b9e2b1623a0758a61104af Mon Sep 17 00:00:00 2001 From: Duncan Moss Date: Fri, 14 Nov 2025 08:02:44 -0800 Subject: [PATCH 057/578] [Kernels] Enable FlashInfer FP8 Blockscale on SM90 (for TEP DSR1) (#27134) Signed-off-by: Duncan Moss Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../fused_moe/flashinfer_cutlass_moe.py | 23 ++- .../flashinfer_cutlass_prepare_finalize.py | 147 ++++++++++++------ .../model_executor/layers/quantization/fp8.py | 48 ++++-- .../quantization/utils/flashinfer_utils.py | 29 +++- 4 files changed, 179 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 85ce77fb1f7f..943695f921ad 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -57,6 +57,7 @@ def __init__( tp_rank: int = 0, tp_size: int = 1, use_dp: bool = False, + use_deepseek_fp8_block_scale: bool = False, ): super().__init__(quant_config) assert quant_config.quant_dtype in ("nvfp4", torch.float8_e4m3fn, None), ( @@ -69,6 +70,10 @@ def __init__( self.tp_size = tp_size self.out_dtype = out_dtype self.use_dp = use_dp + # Enables DeepSeek-style FP8 block-scale path: + # - pass per-block weight scales to the kernel + # - skip input activation quantization (kernel applies scaling) + self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale @property def activation_formats( @@ -147,7 +152,12 @@ def apply( "Only activation silu is supported in FlashInferExperts" ) - if self.quant_dtype == torch.float8_e4m3fn: + # Select quantization metadata based on FP8 format/path + if ( + self.quant_dtype == torch.float8_e4m3fn + and not self.use_deepseek_fp8_block_scale + ): + # FP8 per-tensor path: use global alphas/scales; do not pass input_sf quant_scales = [ self.g1_alphas, self.a2_gscale, @@ -176,6 +186,15 @@ def apply( # FlashInfer API requires weight to be long for nvfp4 fc1_expert_weights = w1.view(torch.long) fc2_expert_weights = w2.view(torch.long) + elif self.use_deepseek_fp8_block_scale: + # FP8 block-scale path: provide block-scale weights, omit a1q_scale + quant_scales = [ + 
self.w1_scale, + self.w2_scale, + ] + a1q_scale = None + fc1_expert_weights = w1 + fc2_expert_weights = w2 else: quant_scales = None a1q_scale = None @@ -196,6 +215,8 @@ def apply( ep_size=self.ep_size, ep_rank=self.ep_rank, output=output, + # Informs FlashInfer to use the block-scale decoding path when True + use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index bc9aab5208d9..762890867e60 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -28,11 +28,15 @@ def __init__( self, use_dp: bool, num_dispatchers: int = 1, + use_deepseek_fp8_block_scale: bool = False, ): super().__init__() self.num_dispatchers_ = num_dispatchers self.use_dp = use_dp self.local_tokens = None + # Toggle for DeepSeek-style FP8 block-scale path where activations are + # not quantized here and weight block scales are consumed by the kernel. + self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -73,8 +77,9 @@ def __init__( self, use_dp: bool, num_dispatchers: int = 1, + use_deepseek_fp8_block_scale: bool = False, ): - super().__init__(use_dp, num_dispatchers) + super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale) self.alltoall_info = None # Initialize all2all_manager only for DP case @@ -97,15 +102,19 @@ def prepare( ) if not self.use_dp: - # Non-DP case: standard quantization - a1q, a1q_scale = moe_kernel_quantize_input( - a1, - quant_config.a1_gscale, - quant_config.quant_dtype, - quant_config.per_act_token_quant, - quant_config.block_shape, - is_fp4_scale_swizzled=not self.use_dp, - ) + # Non-DP case: quantize activations unless using block-scale path + if not self.use_deepseek_fp8_block_scale: + a1q, a1q_scale = moe_kernel_quantize_input( + a1, + quant_config.a1_gscale, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + is_fp4_scale_swizzled=not self.use_dp, + ) + else: + a1q = a1 + a1q_scale = None else: # DP case: use FlashInfer AllToAll global_num_tokens_cpu = get_local_sizes() @@ -122,6 +131,7 @@ def prepare( top_k, num_experts, quant_config, + use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, ) ) @@ -154,8 +164,9 @@ def __init__( self, use_dp: bool, num_dispatchers: int = 1, + use_deepseek_fp8_block_scale: bool = False, ): - super().__init__(use_dp, num_dispatchers) + super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale) def prepare( self, @@ -173,22 +184,42 @@ def prepare( if not self.use_dp and quant_config.quant_dtype == "nvfp4": return a1, None, None, topk_ids, topk_weights - a1q, a1q_scale = moe_kernel_quantize_input( - a1, - quant_config.a1_gscale, - quant_config.quant_dtype, - quant_config.per_act_token_quant, - quant_config.block_shape, - is_fp4_scale_swizzled=not self.use_dp, - ) + if not self.use_deepseek_fp8_block_scale: + a1q, a1q_scale = moe_kernel_quantize_input( + a1, + quant_config.a1_gscale, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + is_fp4_scale_swizzled=not self.use_dp, + ) + else: + # Block-scale path: pass activations through, omit per-token scales + a1q = a1 + a1q_scale = None if self.use_dp: - topk_weights, topk_ids, a1q, a1q_scale = 
get_dp_group().all_gatherv( - [topk_weights, topk_ids, a1q, a1q_scale], - dim=0, - sizes=get_local_sizes(), - ) - if quant_config.quant_dtype == "nvfp4": + # Build gather list conditionally - omit a1q_scale if None + # (block-scale path) + gather_list = [topk_weights, topk_ids, a1q] + if a1q_scale is not None: + gather_list.append(a1q_scale) + gathered = get_dp_group().all_gatherv( + gather_list, + dim=0, + sizes=get_local_sizes(), + ) + topk_weights, topk_ids, a1q, a1q_scale = gathered + else: + gathered = get_dp_group().all_gatherv( + gather_list, + dim=0, + sizes=get_local_sizes(), + ) + topk_weights, topk_ids, a1q = gathered + a1q_scale = None + + if quant_config.quant_dtype == "nvfp4" and a1q_scale is not None: a1q_scale = nvfp4_block_scale_interleave(a1q_scale) return a1q, a1q_scale, None, topk_ids, topk_weights @@ -221,6 +252,7 @@ def flashinfer_alltoall_dispatch( top_k: int, num_experts: int, quant_config: FusedMoEQuantConfig, + use_deepseek_fp8_block_scale: bool = False, ): from flashinfer.comm.trtllm_alltoall import MnnvlMoe @@ -250,30 +282,42 @@ def flashinfer_alltoall_dispatch( ) topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype) - x, x_sf = moe_kernel_quantize_input( - x, - gs, - quant_config.quant_dtype, - quant_config.per_act_token_quant, - quant_config.block_shape, - is_fp4_scale_swizzled=False, # delay swizzle to after comm - ) - x = MnnvlMoe.mnnvl_moe_alltoallv( - x, - alltoall_info, - all2all_manager.workspace_tensor, - ep_rank, - ep_size, - ) + if not use_deepseek_fp8_block_scale: + x, x_sf = moe_kernel_quantize_input( + x, + gs, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + is_fp4_scale_swizzled=False, # delay swizzle to after comm + ) + x = MnnvlMoe.mnnvl_moe_alltoallv( + x, + alltoall_info, + all2all_manager.workspace_tensor, + ep_rank, + ep_size, + ) - x_sf = MnnvlMoe.mnnvl_moe_alltoallv( - x_sf, - alltoall_info, - all2all_manager.workspace_tensor, - ep_rank, - ep_size, - ) - x_sf = nvfp4_block_scale_interleave(x_sf) + x_sf = MnnvlMoe.mnnvl_moe_alltoallv( + x_sf, + alltoall_info, + all2all_manager.workspace_tensor, + ep_rank, + ep_size, + ) + if quant_config.quant_dtype == "nvfp4": + x_sf = nvfp4_block_scale_interleave(x_sf) + else: + # Block-scale path: pass activations through without quantization + x_sf = None + x = MnnvlMoe.mnnvl_moe_alltoallv( + x, + alltoall_info, + all2all_manager.workspace_tensor, + ep_rank, + ep_size, + ) return alltoall_info, topk_ids, topk_weights, x, x_sf @@ -304,6 +348,7 @@ def create_flashinfer_prepare_finalize( use_dp: bool, use_nvfp4: bool = False, enable_alltoallv: bool = False, + use_deepseek_fp8_block_scale: bool = False, ) -> FlashInferCutlassMoEPrepareAndFinalize: """Factory function to create the appropriate FlashInfer implementation.""" if use_nvfp4: @@ -311,5 +356,7 @@ def create_flashinfer_prepare_finalize( return FlashInferAllToAllMoEPrepareAndFinalize(use_dp) else: return FlashInferAllGatherMoEPrepareAndFinalize(use_dp) - # Fp8 only supports AllGather - return FlashInferAllGatherMoEPrepareAndFinalize(use_dp) + # FP8 path currently supported via AllGather; optionally enable block-scale + return FlashInferAllGatherMoEPrepareAndFinalize( + use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index bbd0a4df1048..0479bec33840 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py 
@@ -3,6 +3,7 @@ from collections.abc import Callable from enum import Enum +from functools import partial from typing import TYPE_CHECKING, Any, Optional import torch @@ -122,10 +123,13 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. """ - # prefer FlashInfer backends when available and enabled on supported GPUs + # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100. if ( current_platform.is_cuda() - and current_platform.is_device_capability(100) + and ( + current_platform.is_device_capability(100) + or current_platform.is_device_capability(90) + ) and envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe() ): @@ -134,14 +138,14 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100") return Fp8MoeBackend.FLASHINFER_TRTLLM else: - if block_quant: + if block_quant and current_platform.is_device_capability(100): raise ValueError( "FlashInfer FP8 MoE throughput backend does not " "support block quantization. Please use " "VLLM_FLASHINFER_MOE_BACKEND=latency " "instead." ) - logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM100") + logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM90/SM100") return Fp8MoeBackend.FLASHINFER_CUTLASS # weight-only path for older GPUs without native FP8 @@ -641,6 +645,16 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + if self.block_quant: + assert self.weight_block_size == [128, 128], ( + f"Only support weight_block_size == [128, 128], " + f"got {self.weight_block_size}" + ) + self.flashinfer_moe_fn = partial( + flashinfer_cutlass_moe_fp8, + moe=self.moe, + use_deepseek_fp8_block_scale=self.block_quant, + ) self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM self.allow_cutlass_block_scaled_grouped_gemm = ( @@ -1012,8 +1026,15 @@ def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: ): return None elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + if self.block_quant: + assert self.weight_block_size == [128, 128], ( + f"Only support weight_block_size == [128, 128], " + f"got {self.weight_block_size}" + ) + # Wire block-scale flag through prepare/finalize when using CUTLASS prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( - self.moe + self.moe, + use_deepseek_fp8_block_scale=self.block_quant, ) logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize @@ -1062,9 +1083,11 @@ def select_gemm_impl( ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + # Select GEMM experts with block-scale when weights are block-quantized experts = select_cutlass_fp8_gemm_impl( self.moe, self.moe_quant_config, + use_deepseek_fp8_block_scale=self.block_quant, ) logger.debug_once("Using %s", experts.__class__.__name__) return experts @@ -1251,16 +1274,17 @@ def apply( workspace=layer.workspace, ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert not self.block_quant - assert not renormalize and custom_routing_function is not None assert activation == "silu", ( f"Expected 'silu' activation but got {activation}" ) - assert scoring_func == "sigmoid", ( - f"Expected 'sigmoid' scoring func but got {scoring_func}" - 
) - - result = flashinfer_cutlass_moe_fp8( + if not self.block_quant: + assert not renormalize and custom_routing_function is not None + assert scoring_func == "sigmoid", ( + f"Expected 'sigmoid' scoring func but got {scoring_func}" + ) + # Delegate to CUTLASS FlashInfer path; function already bound with + # use_deepseek_fp8_block_scale for block-quant when applicable + result = self.flashinfer_moe_fn( x, layer, topk_weights, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index e49d374f154d..d9e9b4240271 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 create_flashinfer_prepare_finalize, ) +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -190,17 +191,22 @@ def register_moe_scaling_factors(layer: torch.nn.Module) -> None: def build_flashinfer_fp8_cutlass_moe_prepare_finalize( - moe: FusedMoEConfig | None, + moe: FusedMoEConfig | None, use_deepseek_fp8_block_scale: bool = False ) -> mk.FusedMoEPrepareAndFinalize: """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" use_dp = moe.moe_parallel_config.dp_size > 1 if moe is not None else False - return create_flashinfer_prepare_finalize(use_dp) + # Propagate block-scale flag so prepare/finalize can skip act quantization + # and inform the kernel to consume per-block weight scales. + return create_flashinfer_prepare_finalize( + use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ) def select_cutlass_fp8_gemm_impl( moe: FusedMoEConfig | None, quant_config: FusedMoEQuantConfig, out_dtype: torch.dtype | None = None, + use_deepseek_fp8_block_scale: bool = False, ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return a GEMM *experts* implementation for fused-MoE layers""" @@ -212,12 +218,14 @@ def select_cutlass_fp8_gemm_impl( ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, tp_size=moe.moe_parallel_config.tp_size, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ) assert out_dtype is not None, "If moe config is None, out_dtype must be passed" return FlashInferExperts( out_dtype=out_dtype, quant_config=quant_config, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ) @@ -231,14 +239,22 @@ def flashinfer_cutlass_moe_fp8( global_num_experts: int = -1, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, + use_deepseek_fp8_block_scale: bool = False, + moe: FusedMoEConfig | None = None, ) -> torch.Tensor: quant_config = layer.quant_method.get_fused_moe_quant_config(layer) assert quant_config is not None + # Construct modular kernel with block-scale support when requested. 
fused_experts = mk.FusedMoEModularKernel( - build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe=None), + build_flashinfer_fp8_cutlass_moe_prepare_finalize( + moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ), select_cutlass_fp8_gemm_impl( - moe=None, quant_config=quant_config, out_dtype=hidden_states.dtype + moe=moe, + quant_config=quant_config, + out_dtype=hidden_states.dtype, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ), ) @@ -258,7 +274,10 @@ def flashinfer_cutlass_moe_fp8( def get_flashinfer_moe_backend() -> FlashinferMoeBackend: flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - if flashinfer_moe_backend == "throughput": + # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations + if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability( + 90 + ): return FlashinferMoeBackend.CUTLASS elif flashinfer_moe_backend == "latency": return FlashinferMoeBackend.TENSORRT_LLM From c934caee88f65258aac00d71d9ae0ecc4a4e1cd7 Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Sat, 15 Nov 2025 00:07:20 +0800 Subject: [PATCH 058/578] [Fix] improve aspect ratio in dummy image generation and add common VLM tests for PaddleOCR-VL (#28711) Signed-off-by: dongbo910220 <1275604947@qq.com> --- .../multimodal/generation/test_common.py | 18 ++++++++++++++++++ vllm/model_executor/models/paddleocr_vl.py | 3 +-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 22083d9f1614..95b64b380db0 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -12,6 +12,7 @@ from packaging.version import Version from transformers import ( AutoModel, + AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForTextToWaveform, ) @@ -691,6 +692,23 @@ patch_hf_runner=model_utils.ovis2_5_patch_hf_runner, hf_model_kwargs={"revision": "refs/pr/5"}, ), + "paddleocr_vl": VLMTestInfo( + models=["PaddlePaddle/PaddleOCR-VL"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", + img_idx_to_prompt=lambda idx: ( + "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" + ), + multi_image_prompt=( + "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n" + "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n" + "Describe these two images separately." + ), + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForCausalLM, + image_size_factors=[(), (0.25,)], + ), "phi3v": VLMTestInfo( models=["microsoft/Phi-3.5-vision-instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 183f458658aa..3ef6470070d1 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -232,8 +232,7 @@ def get_image_size_with_most_features(self) -> ImageSize: # Find factors of max_num_tokens close to its square root # to create a dummy image with a reasonable aspect ratio. 
h_patches = int(math.sqrt(max_num_tokens)) - while max_num_tokens % h_patches != 0: - h_patches -= 1 + max_num_tokens -= max_num_tokens % h_patches w_patches = max_num_tokens // h_patches return ImageSize(height=h_patches * factor, width=w_patches * factor) From 5f3cd7f7f20a8e4445d70cbd1f5475175ef391e3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:34:14 +0000 Subject: [PATCH 059/578] [Docs] Update the name of `Transformers backend` -> `Transformers modeling backend` (#28725) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 2 +- docs/contributing/model/README.md | 2 +- .../frameworks/hf_inference_endpoints.md | 4 +-- docs/models/supported_models.md | 26 +++++++++---------- tests/models/test_transformers.py | 4 +-- vllm/config/model.py | 8 +++--- vllm/lora/layers/base_linear.py | 2 +- vllm/model_executor/models/adapters.py | 4 +-- .../models/transformers/__init__.py | 4 +-- .../models/transformers/base.py | 9 ++++--- .../models/transformers/causal.py | 2 +- .../models/transformers/legacy.py | 2 +- .../model_executor/models/transformers/moe.py | 4 +-- .../models/transformers/multimodal.py | 12 +++++---- .../models/transformers/pooling.py | 2 +- .../models/transformers/utils.py | 2 +- 16 files changed, 46 insertions(+), 43 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index bfb0e91fd06e..6e178bb690c5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC -# Transformers backend +# Transformers modeling backend /vllm/model_executor/models/transformers @hmellor /tests/models/test_transformers.py @hmellor diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md index d8c40c519573..13f3edb7e1af 100644 --- a/docs/contributing/model/README.md +++ b/docs/contributing/model/README.md @@ -1,7 +1,7 @@ # Summary !!! important - Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve ` works first! + Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve ` works first! vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance. diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md index d39bb9a899c8..05df0dacd8f1 100644 --- a/docs/deployment/frameworks/hf_inference_endpoints.md +++ b/docs/deployment/frameworks/hf_inference_endpoints.md @@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o ## Advanced Deployment Details -With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications. 
+With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications. Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts. @@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de - Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog - Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index) - Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm) -- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html) +- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c1eb207efcd1..0439e9cf2364 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang ### Transformers -vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend". +vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend". -Currently, the Transformers backend works for the following: +Currently, the Transformers modeling backend works for the following: - Modalities: embedding models, language models and vision-language models* - Architectures: encoder-only, decoder-only, mixture-of-experts @@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following: _*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._ -If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: +If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM: - All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature) - Any combination of the following vLLM parallelisation schemes: @@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model))) If the printed type starts with `Transformers...` then it's using the Transformers model implementation! 
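A minimal offline-inference sketch of this check, combined with the `model_impl` override described in the next paragraph (the model name is only an example):

```python
from vllm import LLM

# Prefer the Transformers modeling backend even if vLLM has a native class.
llm = LLM(model="Qwen/Qwen3-0.6B", model_impl="transformers")

# A printed class name starting with "Transformers" confirms which
# implementation was actually instantiated.
llm.apply_model(lambda model: print(type(model)))
```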
-If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). +If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). !!! note For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. @@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM! -For a model to be compatible with the Transformers backend for vLLM it must: +For a model to be compatible with the Transformers modeling backend for vLLM it must: - be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): - The model directory must have the correct structure (e.g. `config.json` is present). - `config.json` must contain `auto_map.AutoModel`. -- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)): +- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)): - Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). If the compatible model is: @@ -66,13 +66,13 @@ If the compatible model is: - on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md). - in a local directory, simply pass directory path to `model=` for [offline-inference](../serving/offline_inference.md) or `vllm serve ` for the [openai-compatible-server](../serving/openai_compatible_server.md). -This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! +This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! #### Writing custom models -This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). 
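Before diving into those modifications, a hedged sketch of the two loading paths listed earlier (Hub with remote code, or a local directory); the model name here is hypothetical:

```python
from vllm import LLM

# Hypothetical custom model on the Hub whose config.json sets auto_map.AutoModel;
# trust_remote_code lets vLLM pull in the Transformers-compatible implementation.
llm = LLM(model="my-org/my-custom-model", trust_remote_code=True)

# A local checkout works the same way: LLM(model="/path/to/my-custom-model", ...)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```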
-To make your model compatible with the Transformers backend, it needs: +To make your model compatible with the Transformers modeling backend, it needs: 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. - If your model is encoder-only: @@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! @@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. Models do not _need_ to be natively supported to be used in vLLM. -The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). +The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). !!! tip The easiest way to check if your model is really supported at runtime is to run the program below: @@ -451,7 +451,7 @@ th { | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | | `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | -Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! +Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! 
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| @@ -720,7 +720,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | | `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | -Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! +Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------| diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index a18f5b607763..ae5befd2c00b 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test the functionality of the Transformers backend.""" +"""Test the functionality of the Transformers modeling backend.""" from typing import Any @@ -85,7 +85,7 @@ def test_models( required = Version("5.0.0.dev") if model == "allenai/OLMoE-1B-7B-0924" and installed < required: pytest.skip( - "MoE models with the Transformers backend require " + "MoE models with the Transformers modeling backend require " f"transformers>={required}, but got {installed}" ) diff --git a/vllm/config/model.py b/vllm/config/model.py index 8ec66b6b3160..b3a28af6de38 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -732,7 +732,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": return self def _get_transformers_backend_cls(self) -> str: - """Determine which Transformers backend class will be used if + """Determine which Transformers modeling backend class will be used if `model_impl` is set to `transformers` or `auto`.""" cls = "Transformers" # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal @@ -746,8 +746,8 @@ def _get_transformers_backend_cls(self) -> str: # User specified value take precedence if self.runner != "auto": runner = self.runner - # Only consider Transformers backend pooling classes if we're wrapping an - # architecture that defaults to pooling. 
Otherwise, we return the LM class + # Only consider Transformers modeling backend pooling classes if we're wrapping + # an architecture that defaults to pooling. Otherwise, we return the LM class # and use adapters. if runner == "pooling" and task in {"embed", "classify"}: if task == "embed": @@ -759,7 +759,7 @@ def _get_transformers_backend_cls(self) -> str: return cls def using_transformers_backend(self) -> bool: - """Check if the model is using the Transformers backend class.""" + """Check if the model is using the Transformers modeling backend class.""" used_cls = self._model_info.architecture transformers_backend_cls = self._get_transformers_backend_cls() return used_cls == transformers_backend_cls diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index d619a0edc124..3db4165e2017 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -121,7 +121,7 @@ def set_lora( def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # In transformers backend, x and output have extra batch dimension like + # In Transformers modeling backend, x and output have extra batch dimension like # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), # therefore we need to flatten the batch dimensions. if x.ndim == 3 and output.ndim == 3: diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index f742090df71f..a9cc49451a1d 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -429,7 +429,7 @@ def load_weights_using_from_2_way_softmax( if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fallback to get_input_embeddings(), which is used by - # the Transformers backend. + # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") @@ -487,7 +487,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fallback to get_input_embeddings(), which is used by - # the Transformers backend. + # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") diff --git a/vllm/model_executor/models/transformers/__init__.py b/vllm/model_executor/models/transformers/__init__.py index 365b5eb08893..93cd8ff50766 100644 --- a/vllm/model_executor/models/transformers/__init__.py +++ b/vllm/model_executor/models/transformers/__init__.py @@ -120,8 +120,8 @@ def __getattr__(name: str): """Handle imports of non-existent classes with a helpful error message.""" if name not in globals(): raise AttributeError( - "The Transformers backend does not currently have a class to handle " - f"the requested model type: {name}. Please open an issue at " + "The Transformers modeling backend does not currently have a class to " + f"handle the requested model type: {name}. 
Please open an issue at " "https://github.com/vllm-project/vllm/issues/new" ) return globals()[name] diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index 63096e57f8ee..f4ba4758bcc4 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend base class.""" +"""Transformers modeling backend base class.""" from collections.abc import Iterable from typing import TYPE_CHECKING @@ -118,7 +118,7 @@ def __init_subclass__(cls, *args, **kwargs): def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): super().__init__() - logger.info("Using Transformers backend.") + logger.info("Using Transformers modeling backend.") self.config = vllm_config.model_config.hf_config self.text_config = self.config.get_text_config() @@ -147,7 +147,8 @@ def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): # Check for unsupported quantization methods. if quant_method_name == "mxfp4": raise NotImplementedError( - "Transformers backend does not support MXFP4 quantization yet." + "Transformers modeling backend does " + "not support MXFP4 quantization yet." ) # Skip loading extra bias for GPTQ models. if "gptq" in quant_method_name: @@ -458,6 +459,6 @@ def check_version(min_version: str, feature: str): required = Version(min_version) if installed < required: raise ImportError( - f"Transformers backend requires transformers>={required} " + f"Transformers modeling backend requires transformers>={required} " f"for {feature}, but got {installed}" ) diff --git a/vllm/model_executor/models/transformers/causal.py b/vllm/model_executor/models/transformers/causal.py index 42fd11117c73..b2865ed0c7ff 100644 --- a/vllm/model_executor/models/transformers/causal.py +++ b/vllm/model_executor/models/transformers/causal.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for causal language models.""" +"""Transformers modeling backend mixin for causal language models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/legacy.py b/vllm/model_executor/models/transformers/legacy.py index a453870a2687..aca630be5615 100644 --- a/vllm/model_executor/models/transformers/legacy.py +++ b/vllm/model_executor/models/transformers/legacy.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for legacy models.""" +"""Transformers modeling backend mixin for legacy models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 8e39eb0b9902..4973014c3d4e 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend mixin for Mixture of Experts (MoE) models.""" +"""Transformers modeling backend mixin for Mixture of Experts (MoE) models.""" from typing import TYPE_CHECKING, Any @@ -39,7 +39,7 @@ @CustomOp.register("transformers_fused_moe") class TransformersFusedMoE(FusedMoE): - """Custom FusedMoE for the Transformers backend.""" + """Custom FusedMoE for the Transformers modeling backend.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index 9b0463f41fa8..ccf605371987 100644 --- a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for multi-modal models.""" +"""Transformers modeling backend mixin for multi-modal models.""" from collections.abc import Mapping from typing import TYPE_CHECKING @@ -310,9 +310,9 @@ def forward( return model_output def get_language_model(self) -> torch.nn.Module: - """Transformers backend multimodal classes do not contain a separate vLLM - language model class. Therefore, in order to return a language model vLLM class, - we use a wrapper to give `self` the same interface as a text model.""" + """Transformers modeling backend multimodal classes do not contain a separate + vLLM language model class. Therefore, in order to return a language model vLLM + class, we use a wrapper to give `self` the same interface as a text model.""" # Exclude self and object bases = self.__class__.mro()[1:-1] @@ -385,7 +385,9 @@ def get_mrope_input_positions( for k, v in kwargs.items() if k not in {"image_grid_thw", "video_grid_thw"} ): - raise NotImplementedError("Transformers backend only supports images.") + raise NotImplementedError( + "Transformers modeling backend only supports images." + ) image_grid_thw = kwargs.get("image_grid_thw", []) video_grid_thw = kwargs.get("video_grid_thw", []) diff --git a/vllm/model_executor/models/transformers/pooling.py b/vllm/model_executor/models/transformers/pooling.py index 8117bbac013e..4c2a74bccb6a 100644 --- a/vllm/model_executor/models/transformers/pooling.py +++ b/vllm/model_executor/models/transformers/pooling.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixins for pooling models.""" +"""Transformers modeling backend mixins for pooling models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 267a6e06e6bb..517eb54d53ac 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend utilities.""" +"""Transformers modeling backend utilities.""" from contextlib import contextmanager from pathlib import Path From d54a18a47e7cb6a126a022914c7965f84e15217c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 11:37:18 -0500 Subject: [PATCH 060/578] [CI][CPU] Smoke test for Apple Silicon using GHA MacOS runner (#28688) Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 73 ++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 .github/workflows/macos-smoke-test.yml diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml new file mode 100644 index 000000000000..f56fdc0dbe79 --- /dev/null +++ b/.github/workflows/macos-smoke-test.yml @@ -0,0 +1,73 @@ +name: macOS Apple Silicon Smoke Test + +on: + workflow_dispatch: # Manual trigger + +jobs: + macos-m1-smoke-test: + runs-on: macos-latest + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + + - uses: astral-sh/setup-uv@v4 + with: + enable-cache: true + python-version: '3.12' + + - name: Install dependencies + run: | + uv pip install -r requirements/cpu-build.txt + uv pip install -r requirements/cpu.txt + + - name: Build vLLM + run: uv pip install -v -e . + env: + CMAKE_BUILD_PARALLEL_LEVEL: 4 + + - name: Verify installation + run: | + python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" + python -c "import torch; print(f'PyTorch: {torch.__version__}')" + + - name: Smoke test vllm serve + timeout-minutes: 10 + run: | + # Start server in background + vllm serve Qwen/Qwen3-0.6B \ + --max-model-len=2048 \ + --load-format=dummy \ + --enforce-eager \ + --port 8000 & + + SERVER_PID=$! + + # Wait for server to start + for i in {1..30}; do + if curl -s http://localhost:8000/health > /dev/null; then + echo "Server started successfully" + break + fi + if [ "$i" -eq 30 ]; then + echo "Server failed to start" + kill "$SERVER_PID" + exit 1 + fi + sleep 2 + done + + # Test health endpoint + curl -f http://localhost:8000/health + + # Test completion + curl -f http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello", + "max_tokens": 5 + }' + + # Cleanup + kill "$SERVER_PID" From 6f1e7f7226447f606a0731376a2d0bd080aa2767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 14 Nov 2025 17:58:01 +0100 Subject: [PATCH 061/578] [DisaggEverything] Tokens in<>out `/generate` endpoint (#24261) Signed-off-by: NickLucche Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../online_serving/token_generation_client.py | 49 ++++ requirements/docs.txt | 4 + .../entrypoints/openai/test_serving_tokens.py | 262 +++++++++++++++++ vllm/engine/arg_utils.py | 5 + vllm/entrypoints/openai/api_server.py | 81 ++++++ vllm/entrypoints/openai/cli_args.py | 5 + vllm/entrypoints/openai/protocol.py | 77 +++++ vllm/entrypoints/openai/serving_engine.py | 4 + vllm/entrypoints/openai/serving_tokens.py | 269 ++++++++++++++++++ vllm/sampling_params.py | 2 + vllm/v1/engine/__init__.py | 8 +- vllm/v1/serial_utils.py | 65 ++++- 12 files changed, 822 insertions(+), 9 deletions(-) create mode 100644 examples/online_serving/token_generation_client.py create mode 100644 tests/entrypoints/openai/test_serving_tokens.py create mode 100644 vllm/entrypoints/openai/serving_tokens.py diff --git a/examples/online_serving/token_generation_client.py 
b/examples/online_serving/token_generation_client.py new file mode 100644 index 000000000000..88ee43c5d9cd --- /dev/null +++ b/examples/online_serving/token_generation_client.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import httpx +from transformers import AutoTokenizer + +GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate" +DUMMY_API_KEY = "empty" +MODEL_NAME = "Qwen/Qwen3-0.6B" + +transport = httpx.HTTPTransport() +headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"} +client = httpx.Client( + transport=transport, + base_url=GEN_ENDPOINT, + timeout=600, + headers=headers, +) +messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "How many countries are in the EU?"}, +] + + +def main(client): + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, + ) + payload = { + "model": MODEL_NAME, + "token_ids": token_ids, + "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False}, + "stream": False, + } + resp = client.post(GEN_ENDPOINT, json=payload) + resp.raise_for_status() + data = resp.json() + print(data) + print("-" * 50) + print("Token generation results:") + res = tokenizer.decode(data["choices"][0]["token_ids"]) + print(res) + print("-" * 50) + + +if __name__ == "__main__": + main(client) diff --git a/requirements/docs.txt b/requirements/docs.txt index 0fd6dbe22c51..32e004b2b64b 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -10,3 +10,7 @@ mkdocs-minify-plugin regex ruff pydantic + +# For generating argparse docs. +# Adding requirements here should only be used as a last resort. 
+msgspec # Need for multiple inheritance involving msgspec.Struct \ No newline at end of file diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py new file mode 100644 index 000000000000..62d843e35b86 --- /dev/null +++ b/tests/entrypoints/openai/test_serving_tokens.py @@ -0,0 +1,262 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import httpx +import pytest +import pytest_asyncio +from transformers import AutoTokenizer + +from vllm.config import ModelConfig +from vllm.v1.engine.detokenizer import check_stop_strings + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-0.6B" +GEN_ENDPOINT = "/inference/v1/generate" + + +def get_vocab_size(model_name): + config = ModelConfig( + model=model_name, + seed=0, + dtype="bfloat16", + ) + return config.get_vocab_size() + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(MODEL_NAME) + + +@pytest.fixture(scope="module") +def messages(): + return [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "How many countries are in the EU?"}, + ] + + +@pytest.fixture(scope="module") +def server(request): + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "1024", + "--enforce-eager", + ] + + extra_args = getattr(request, "param", None) + if extra_args is not None: + args = args + ( + list(extra_args) + if isinstance(extra_args, (list, tuple)) + else [str(extra_args)] + ) + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server: RemoteOpenAIServer): + transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None + headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} + async with httpx.AsyncClient( + transport=transport, + base_url=server.url_root, + timeout=600, + headers=headers, + ) as c: + yield c + + +@pytest.mark.asyncio +async def test_generate_endpoint(client): + payload = { + "model": MODEL_NAME, + "token_ids": [1, 2, 3], + "sampling_params": {"max_tokens": 5}, + "stream": False, + } + resp = await client.post(GEN_ENDPOINT, json=payload) + resp.raise_for_status() + data = resp.json() + assert "choices" in data + + +@pytest.mark.asyncio +async def test_same_response_as_chat_completions(client, tokenizer, messages): + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, # default with Qwen3 + ) + for ignore_eos in [True, False]: + payload = { + "model": MODEL_NAME, + "token_ids": token_ids, + "sampling_params": { + "max_tokens": 24, + "temperature": 0.0, + # NOTE coordinator will set this to skip detokenization + "detokenize": False, + "ignore_eos": ignore_eos, + }, + "stream": False, + } + generate_resp = await client.post(GEN_ENDPOINT, json=payload) + generate_data = generate_resp.json() + generate_res = tokenizer.decode( + generate_data["choices"][0]["token_ids"], skip_special_tokens=True + ) + + payload = { + "model": MODEL_NAME, + "messages": messages, + "max_tokens": 24, + "temperature": 0.0, + "stream": False, + "ignore_eos": ignore_eos, + "chat_template_kwargs": dict(enable_thinking=False), + } + completions_resp = await client.post("/v1/chat/completions", json=payload) + completions_data = completions_resp.json() + completions_res = completions_data["choices"][0]["message"]["content"] + + assert generate_res == completions_res + + +@pytest.mark.asyncio 
+async def test_stop_string_workflow(client, tokenizer, messages): + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, # default with Qwen3 + ) + payload = { + "model": MODEL_NAME, + "token_ids": token_ids, + "sampling_params": { + "max_tokens": 24, + "temperature": 0.0, + "detokenize": False, + # stop strings are only supported when detokenize is True. + "stop": ["27 member"], + }, + # TODO stream test is much more interesting + "stream": False, + } + with pytest.raises(httpx.HTTPStatusError): + generate_resp = await client.post(GEN_ENDPOINT, json=payload) + generate_resp.raise_for_status() + + payload["sampling_params"]["stop"] = None + generate_resp = await client.post( + GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"} + ) + generate_data = generate_resp.json() + generate_res = tokenizer.decode( + generate_data["choices"][0]["token_ids"], skip_special_tokens=True + ) + + # NOTE This is under the responsibility of the coordinator + # stop_checker = StopChecker( + # max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer + # ) + stop_str, truncate_to = check_stop_strings( + generate_res, len(generate_res), ["27 member"], False + ) + assert stop_str == "27 member" + # abort request that hit stop string (requires tokens-only mode) + # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501 + # res.raise_for_status() + generate_res = generate_res[:truncate_to] + + # Get stop_str response from chat completions + payload = { + "model": MODEL_NAME, + "messages": messages, + "max_tokens": 24, + "temperature": 0.0, + "stream": False, + "stop": ["27 member"], + "chat_template_kwargs": dict(enable_thinking=False), + } + completions_resp = await client.post("/v1/chat/completions", json=payload) + completions_data = completions_resp.json() + completions_res = completions_data["choices"][0]["message"]["content"] + assert generate_res == completions_res + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "server", + [ + [ + "--enable-lora", + "--lora-modules", + "Alice=charent/self_cognition_Alice", + "Bob=charent/self_cognition_Bob", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + ] + ], + indirect=True, +) +async def test_generate_with_lora_adapter(client, tokenizer, messages): + # Verify adapters are listed + models_resp = await client.get("/v1/models") + models_resp.raise_for_status() + models = {m["id"] for m in models_resp.json().get("data", [])} + assert {"Alice", "Bob"}.issubset(models) + + # Generate using a LoRA adapter by specifying its name as the model + payload = { + "model": "Alice", + "token_ids": [1, 2, 3], + "sampling_params": {"max_tokens": 5}, + "stream": False, + } + resp = await client.post(GEN_ENDPOINT, json=payload) + resp.raise_for_status() + data = resp.json() + assert "choices" in data + + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, # default with Qwen3 + ) + payload = { + "model": "Alice", + "token_ids": token_ids, + "sampling_params": { + "max_tokens": 24, + "temperature": 0.0, + "detokenize": False, + }, + "stream": False, + } + generate_resp = await client.post(GEN_ENDPOINT, json=payload) + generate_data = generate_resp.json() + generate_res = tokenizer.decode( + generate_data["choices"][0]["token_ids"], skip_special_tokens=True + ) + + payload = { + "model": "Alice", + "messages": messages, + "max_tokens": 24, + "temperature": 0.0, + "stream": False, + 
"chat_template_kwargs": dict(enable_thinking=False), + } + completions_resp = await client.post("/v1/chat/completions", json=payload) + completions_data = completions_resp.json() + completions_res = completions_data["choices"][0]["message"]["content"] + + assert generate_res == completions_res diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cacebc530b6e..999ed780c20b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -566,6 +566,7 @@ class EngineArgs: kv_offloading_backend: KVOffloadingBackend | None = ( CacheConfig.kv_offloading_backend ) + tokens_only: bool = False def __post_init__(self): # support `EngineArgs(compilation_config={...})` @@ -1495,6 +1496,10 @@ def create_engine_config( else ParallelConfig.data_parallel_rpc_port ) + if self.tokens_only and not model_config.skip_tokenizer_init: + model_config.skip_tokenizer_init = True + logger.info("Skipping tokenizer initialization for tokens-only mode.") + # Forward the deprecated CLI args to the EPLB config. if self.num_redundant_experts is not None: self.eplb_config.num_redundant_experts = self.num_redundant_experts diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f30c6ef2cd0a..3e59af717d95 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -65,6 +65,8 @@ EmbeddingResponse, ErrorInfo, ErrorResponse, + GenerateRequest, + GenerateResponse, IOProcessorResponse, PoolingBytesResponse, PoolingRequest, @@ -96,6 +98,7 @@ from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses from vllm.entrypoints.openai.serving_score import ServingScores from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization +from vllm.entrypoints.openai.serving_tokens import ServingTokens from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation, @@ -357,6 +360,10 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client +def generate_tokens(request: Request) -> ServingTokens | None: + return request.app.state.serving_tokens + + @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: """Health check.""" @@ -1228,6 +1235,41 @@ async def is_scaling_elastic_ep(raw_request: Request): ] +@router.post( + "/inference/v1/generate", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def generate(request: GenerateRequest, raw_request: Request): + handler = generate_tokens(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support generate tokens API" + ) + try: + generator = await handler.serve_tokens(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, GenerateResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + if envs.VLLM_TORCH_PROFILER_DIR: 
logger.warning_once( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -1629,6 +1671,31 @@ async def log_response(request: Request, call_next): ) app = sagemaker_standards.bootstrap(app) + # Optional endpoints + if args.tokens_only: + + @app.post("/abort_requests") + async def abort_requests(raw_request: Request): + """ + Abort one or more requests. To be used in a + Disaggregated Everything setup. + """ + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + request_ids = body.get("request_ids") + if request_ids is None: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail="Missing 'request_ids' in request body", + ) + # Abort requests in background + asyncio.create_task(engine_client(raw_request).abort(request_ids)) + return Response(status_code=200) return app @@ -1851,6 +1918,20 @@ async def init_app_state( if "generate" in supported_tasks else None ) + state.serving_tokens = ( + ServingTokens( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + log_error_stack=args.log_error_stack, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_log_outputs=args.enable_log_outputs, + force_no_detokenize=args.tokens_only, + ) + if "generate" in supported_tasks + else None + ) state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 476587c17823..946362ce2ef0 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -189,6 +189,11 @@ class FrontendArgs: Helps mitigate header abuse. Default: 256.""" log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE """If set to True, log the stack trace of error responses""" + tokens_only: bool = False + """ + If set to True, only enable the Tokens In<>Out endpoint. + This is intended for use in a Disaggregated Everything setup. + """ @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 45584df8b9e2..65bd15ba387b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3220,3 +3220,80 @@ class TranslationResponseVerbose(OpenAIBaseModel): words: list[TranslationWord] | None = None """Extracted words and their corresponding timestamps.""" + + +####### Tokens IN <> Tokens OUT ####### +class GenerateRequest(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + token_ids: list[int] + """The token ids to generate text from.""" + + # features: MultiModalFeatureSpec + # TODO (NickLucche): implement once Renderer work is completed + features: str | None = None + """The processed MM inputs for the model.""" + + sampling_params: SamplingParams + """The sampling parameters for the model.""" + + model: str | None = None + + stream: bool | None = False + stream_options: StreamOptions | None = None + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." + ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + +class GenerateResponseChoice(BaseModel): + index: int + logprobs: ChatCompletionLogProbs | None = None + # per OpenAI spec this is the default + finish_reason: str | None = "stop" + token_ids: list[int] | None = None + + +class GenerateResponse(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + choices: list[GenerateResponseChoice] + + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 03f10e5a91e6..c50b0c4a23e1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -58,6 +58,8 @@ ErrorResponse, FunctionCall, FunctionDefinition, + GenerateRequest, + GenerateResponse, IOProcessorRequest, PoolingResponse, RerankRequest, @@ -134,6 +136,7 @@ | SpeechToTextRequest | ResponsesRequest | IOProcessorRequest + | GenerateRequest ) AnyResponse: TypeAlias = ( @@ -145,6 +148,7 @@ | PoolingResponse | ClassificationResponse | ScoreResponse + | GenerateResponse ) diff --git a/vllm/entrypoints/openai/serving_tokens.py b/vllm/entrypoints/openai/serving_tokens.py new file mode 100644 index 000000000000..69a526b9b70d --- /dev/null +++ b/vllm/entrypoints/openai/serving_tokens.py @@ -0,0 +1,269 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import time +from collections.abc import AsyncGenerator +from collections.abc import Sequence as GenericSequence + +from fastapi import Request + +# yapf: disable +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ChatCompletionLogProb, + ChatCompletionLogProbs, + ChatCompletionLogProbsContent, + ErrorResponse, + GenerateRequest, + GenerateResponse, + GenerateResponseChoice, + PromptTokenUsageInfo, + RequestResponseMetadata, + UsageInfo, +) +from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt +from vllm.logger import init_logger +from vllm.logprobs import Logprob +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.utils.collection_utils import as_list + +logger = init_logger(__name__) + + +class ServingTokens(OpenAIServing): + """Provides Tokens IN <> Tokens OUT functionality to vLLM API.""" + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + force_no_detokenize: bool = False, + return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, + enable_prompt_tokens_details: bool = False, + enable_log_outputs: bool = False, + ): + super().__init__(engine_client=engine_client, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack) + self.enable_prompt_tokens_details = enable_prompt_tokens_details + self.enable_log_outputs = enable_log_outputs + self.force_no_detokenize = force_no_detokenize + if force_no_detokenize: + logger.info("Tokens-only mode is enabled, skipping detokenization " + "step for incoming requests.") + + async def serve_tokens( + self, + request: GenerateRequest, + raw_request: Request | None = None + ) -> GenerateResponse | ErrorResponse: + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) + return error_check_ret + + # If the engine is dead, raise the engine's DEAD_ERROR. 
+ # This is required for the streaming case, where we return a + # success status before we actually start generating text :). + if self.engine_client.errored: + raise self.engine_client.dead_error + + lora_request = None + lora_request = self._maybe_get_adapters(request, + supports_default_mm_loras=True) + + model_name = self.models.model_name(lora_request) + + request_id = "generate-tokens-" \ + f"{self._base_request_id(raw_request, request.request_id)}" + + request_metadata = RequestResponseMetadata(request_id=request_id) + if raw_request: + raw_request.state.request_metadata = request_metadata + + # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is + # completed + engine_prompt = EngineTokensPrompt(prompt_token_ids=request.token_ids) + if request.features is not None: + engine_prompt["multi_modal_data"] = None + + if hasattr(request, "cache_salt") and request.cache_salt is not None: + engine_prompt["cache_salt"] = request.cache_salt + + # Schedule the request and get the result generator. + result_generator: AsyncGenerator[RequestOutput, None] | None = None + try: + sampling_params = request.sampling_params + if self.force_no_detokenize: + sampling_params.detokenize = False + + self._log_inputs(request_id, + request.token_ids, + params=sampling_params, + lora_request=lora_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + result_generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + except ValueError as e: + return self.create_error_response(str(e)) + + # TODO(NickLucche): Implement streaming response + + try: + assert result_generator is not None + return await self.serve_tokens_full_generator( + request, result_generator, request_id, model_name, + request_metadata) + except ValueError as e: + return self.create_error_response(str(e)) + + async def serve_tokens_full_generator( + self, + request: GenerateRequest, + result_generator: AsyncGenerator[RequestOutput, None], + request_id: str, + model_name: str, + request_metadata: RequestResponseMetadata, + ) -> ErrorResponse | GenerateResponse: + + created_time = int(time.time()) + final_res: RequestOutput | None = None + sampling_params: SamplingParams = request.sampling_params + + try: + async for res in result_generator: + final_res = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + return self.create_error_response(str(e)) + + assert final_res is not None + + choices: list[GenerateResponseChoice] = [] + num_generated_tokens = 0 + for output in final_res.outputs: + token_ids = output.token_ids + out_logprobs = output.logprobs + + # This is top_logprobs in completions API + if sampling_params.logprobs: + assert out_logprobs is not None, "Did not output logprobs" + logprobs = self._create_tokens_logprobs( + token_ids=token_ids, + top_logprobs=out_logprobs, + num_output_top_logprobs=sampling_params.logprobs, + ) + else: + logprobs = None + + choice_data = GenerateResponseChoice( + index=output.index, + logprobs=logprobs, + finish_reason=output.finish_reason + if output.finish_reason else "stop", + token_ids=as_list(output.token_ids)) + + choices.append(choice_data) + num_generated_tokens += len(output.token_ids) + + assert final_res.prompt_token_ids is not None + num_prompt_tokens = len(final_res.prompt_token_ids) + if 
final_res.encoder_prompt_token_ids is not None: + num_prompt_tokens += len(final_res.encoder_prompt_token_ids) + + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + # This info is not available at the /coordinator level + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) + + request_metadata.final_usage_info = usage + + response = GenerateResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs), + kv_transfer_params=final_res.kv_transfer_params, + ) + + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + for choice in choices: + # Get the corresponding output token IDs + output_token_ids = None + if choice.index < len(final_res.outputs): + output_token_ids = final_res.outputs[ + choice.index].token_ids + + if output_token_ids: + # Log token_ids only. + self.request_logger.log_outputs( + request_id=request_id, + outputs="", + output_token_ids=output_token_ids, + finish_reason=choice.finish_reason, + is_streaming=False, + delta=False, + ) + + return response + + def _create_tokens_logprobs( + self, + token_ids: GenericSequence[int], + top_logprobs: GenericSequence[dict[int, Logprob] | None], + num_output_top_logprobs: int | None = None, + ) -> ChatCompletionLogProbs: + """Create OpenAI-style logprobs.""" + logprobs_content: list[ChatCompletionLogProbsContent] = [] + + for i, token_id in enumerate(token_ids): + token = f"token_id:{token_id}" + step_top_logprobs = top_logprobs[i] + if step_top_logprobs is None or step_top_logprobs.get( + token_id) is None: + logprobs_content.append( + ChatCompletionLogProbsContent(token=token, )) + else: + step_token = step_top_logprobs[token_id] + + logprobs_content.append( + ChatCompletionLogProbsContent( + token=token, + logprob=max(step_token.logprob, -9999.0), + top_logprobs=[ + ChatCompletionLogProb( + token=token, + logprob=max(p[1].logprob, -9999.0), + ) for i, p in enumerate(step_top_logprobs.items()) + if num_output_top_logprobs + and i < num_output_top_logprobs + ])) + + return ChatCompletionLogProbs(content=logprobs_content) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4b2a3bc4dbaa..dd820840410e 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -15,6 +15,7 @@ from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.v1.serial_utils import PydanticMsgspecMixin logger = init_logger(__name__) @@ -122,6 +123,7 @@ class RequestOutputKind(Enum): class SamplingParams( + PydanticMsgspecMixin, msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] # required for @cached_property. diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 058a4bcaecb5..3f621d77c024 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -15,6 +15,7 @@ from vllm.sampling_params import SamplingParams from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import LogprobsLists, LogprobsTensors +from vllm.v1.serial_utils import UtilityResult # These are possible values of RequestOutput.finish_reason, # so form part of the external API. 
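Since `SamplingParams` now mixes in `PydanticMsgspecMixin` (see the sampling_params.py hunk above; the mixin itself is defined in serial_utils.py further below), the msgspec Struct can be declared directly as a field of a Pydantic model, which is what lets `GenerateRequest` carry `sampling_params: SamplingParams`. A hedged standalone sketch of that behaviour:

```python
# Sketch only: MiniGenerateRequest is an illustrative stand-in, not the real class.
from pydantic import BaseModel

from vllm.sampling_params import SamplingParams


class MiniGenerateRequest(BaseModel):
    token_ids: list[int]
    sampling_params: SamplingParams


req = MiniGenerateRequest.model_validate(
    {"token_ids": [1, 2, 3], "sampling_params": {"max_tokens": 8, "detokenize": False}}
)
print(type(req.sampling_params).__name__)  # SamplingParams
print(req.sampling_params.temperature)  # 1.0 – Struct defaults are preserved
```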
@@ -131,13 +132,6 @@ def finished(self) -> bool: return self.finish_reason is not None -class UtilityResult: - """Wrapper for special handling when serializing/deserializing.""" - - def __init__(self, r: Any = None): - self.result = r - - class UtilityOutput( msgspec.Struct, array_like=True, # type: ignore[call-arg] diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index cf0b1a41b50f..0a6806390451 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -8,7 +8,7 @@ from functools import partial from inspect import isclass from types import FunctionType -from typing import Any, TypeAlias +from typing import Any, TypeAlias, get_type_hints import cloudpickle import msgspec @@ -16,6 +16,8 @@ import torch import zmq from msgspec import msgpack +from pydantic import GetCoreSchemaHandler +from pydantic_core import core_schema from vllm import envs from vllm.logger import init_logger @@ -32,7 +34,6 @@ NestedTensors, ) from vllm.utils.platform_utils import is_pin_memory_available -from vllm.v1.engine import UtilityResult from vllm.v1.utils import tensor_data logger = init_logger(__name__) @@ -104,6 +105,13 @@ def _decode_type_info_recursive( return convert_fn(type_info, data) +class UtilityResult: + """Wrapper for special handling when serializing/deserializing.""" + + def __init__(self, r: Any = None): + self.result = r + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -469,3 +477,56 @@ def run_method( else: func = partial(method, obj) # type: ignore return func(*args, **kwargs) + + +class PydanticMsgspecMixin: + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: GetCoreSchemaHandler + ) -> core_schema.CoreSchema: + """ + Make msgspec.Struct compatible with Pydantic, respecting defaults. + Handle JSON=>msgspec.Struct. Used when exposing msgspec.Struct to the + API as input or in `/docs`. Note this is cached by Pydantic and not + called on every validation. + """ + msgspec_fields = {f.name: f for f in msgspec.structs.fields(source_type)} + type_hints = get_type_hints(source_type) + + # Build the Pydantic typed_dict_field for each msgspec field + fields = {} + for name, hint in type_hints.items(): + msgspec_field = msgspec_fields[name] + + # typed_dict_field using the handler to get the schema + field_schema = handler(hint) + + # Add default value to the schema. 
+ if msgspec_field.default_factory is not msgspec.NODEFAULT: + wrapped_schema = core_schema.with_default_schema( + schema=field_schema, + default_factory=msgspec_field.default_factory, + ) + fields[name] = core_schema.typed_dict_field(wrapped_schema) + elif msgspec_field.default is not msgspec.NODEFAULT: + wrapped_schema = core_schema.with_default_schema( + schema=field_schema, + default=msgspec_field.default, + ) + fields[name] = core_schema.typed_dict_field(wrapped_schema) + else: + # No default, so Pydantic will treat it as required + fields[name] = core_schema.typed_dict_field(field_schema) + return core_schema.no_info_after_validator_function( + cls._validate_msgspec, + core_schema.typed_dict_schema(fields), + ) + + @classmethod + def _validate_msgspec(cls, value: Any) -> Any: + """Validate and convert input to msgspec.Struct instance.""" + if isinstance(value, cls): + return value + if isinstance(value, dict): + return cls(**value) + return msgspec.convert(value, type=cls) From 8cc40f89926f0f49d320c4ef078c70cf535c589e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 14 Nov 2025 12:13:37 -0500 Subject: [PATCH 062/578] [Attention] Bump FA for removed method (#28429) Signed-off-by: Matthew Bonanni Co-authored-by: Cyrus Leung --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 29db9fa273a4..567c8959f045 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309 + GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From a17e36f2236a15012c8ddfedbf076a526de4b17b Mon Sep 17 00:00:00 2001 From: Mohammad Othman <48595863+OthmanMohammad@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:35:45 +0200 Subject: [PATCH 063/578] Fix typo in comment: existance -> existence (#28737) Signed-off-by: Mohammad Othman --- vllm/_aiter_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 5508e59bcd2f..7c35bf1857ba 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -30,7 +30,7 @@ def if_aiter_supported(func: Callable) -> Callable: @functools.wraps(func) def wrapper(*args, **kwargs): - # checks the platform, device arch and aiter library existance. + # checks the platform, device arch and aiter library existence. 
if current_platform.is_rocm() and IS_AITER_FOUND: from vllm.platforms.rocm import on_gfx9 From 085424808ef705efbf59e7b18bc010f53d9d7f75 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:54:38 +0100 Subject: [PATCH 064/578] Remove audio optional dependency for mistral-common (#28722) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Cyrus Leung --- docs/contributing/model/transcription.md | 2 +- docs/models/supported_models.md | 3 +++ examples/offline_inference/audio_language.py | 1 + requirements/common.txt | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index a590ecd6a1a2..fca941acd507 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t ## Examples in-tree - Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py) -- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py) +- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`. - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py) ## Test with the API diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0439e9cf2364..9cdf644c3cc5 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -785,6 +785,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ | +!!! note + `VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed. + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 53d69bbdbdc7..04e6f99f8957 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple): # Voxtral +# Make sure to install mistral-common[audio]. 
def run_voxtral(question: str, audio_count: int) -> ModelRequestData: from mistral_common.audio import Audio from mistral_common.protocol.instruct.chunk import ( diff --git a/requirements/common.txt b/requirements/common.txt index 90efb79a845d..ad92ba3ad827 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.13.0 -mistral_common[image,audio] >= 1.8.5 +mistral_common[image] >= 1.8.5 opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 From cdd7025961cf79480f885804c21e7d60866fb33f Mon Sep 17 00:00:00 2001 From: czhu-cohere Date: Fri, 14 Nov 2025 12:59:11 -0500 Subject: [PATCH 065/578] [kernel] Improve FP8 PTPC on Hopper for larger shapes (#28692) Signed-off-by: czhu-cohere --- .../c3x/scaled_mm_sm90_fp8_dispatch.cuh | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh index 4ff3e65f2b2e..b8433214be1b 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh @@ -116,6 +116,26 @@ struct sm90_fp8_config_default { ClusterShape, KernelSchedule, EpilogueSchedule>>; }; +template +struct sm90_fp8_config_M8192_K6144 { + // M >= 8192, K >= 6144 + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum; + using EpilogueSchedule = + typename cutlass::epilogue::TmaWarpSpecializedCooperative; + using TileShape = Shape<_256, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + + using Cutlass3xGemm = conditional_t< + EnableBias, + cutlass_3x_gemm_sm90_fp8, + cutlass_3x_gemm_sm90_fp8>; +}; + template struct sm90_fp8_config_M128 { // M in (64, 128] @@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm90_fp8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM8192_K6144 = + typename sm90_fp8_config_M8192_K6144::Cutlass3xGemm; using Cutlass3xGemmM128 = typename sm90_fp8_config_M128::Cutlass3xGemm; @@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, uint32_t const m = a.size(0); uint32_t const n = b.size(1); + uint32_t const k = a.size(1); if (m <= 16) { // m in [1, 16] @@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, // m in (64, 128] return cutlass_gemm_caller_sm90_fp8( out, a, b, a_scales, b_scales, std::forward(args)...); + } else if (m >= 8192 && k >= 6144) { + return cutlass_gemm_caller_sm90_fp8( + out, a, b, a_scales, b_scales, std::forward(args)...); } else { // m in (128, inf) return cutlass_gemm_caller_sm90_fp8( From 9261eb3dc19e985806a47ab2eb03035557f29c1f Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Fri, 14 Nov 2025 13:08:30 -0500 Subject: [PATCH 066/578] docs(lora_resolvers): clarify multi-resolver order and storage path requirement (#28153) Signed-off-by: Chen Wang Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .markdownlint.yaml | 2 + docs/.nav.yml | 5 +- docs/design/lora_resolver_plugins.md | 220 ++++++++++++++++++++++++++ vllm/plugins/lora_resolvers/README.md | 16 -- 4 files changed, 226 
insertions(+), 17 deletions(-) create mode 100644 docs/design/lora_resolver_plugins.md delete mode 100644 vllm/plugins/lora_resolvers/README.md diff --git a/.markdownlint.yaml b/.markdownlint.yaml index cd9df57cd980..d0d3179766ef 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -3,6 +3,8 @@ MD007: MD013: false MD024: siblings_only: true +MD031: + list_items: false MD033: false MD045: false MD046: false diff --git a/docs/.nav.yml b/docs/.nav.yml index c103ed476d76..3151ea0e2ec2 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -46,7 +46,10 @@ nav: - contributing/model/multimodal.md - contributing/model/transcription.md - CI: contributing/ci - - Design Documents: design + - Design Documents: + - Plugins: + - design/*plugin*.md + - design/* - API Reference: - api/README.md - api/vllm diff --git a/docs/design/lora_resolver_plugins.md b/docs/design/lora_resolver_plugins.md new file mode 100644 index 000000000000..bd0dc6dc9c7b --- /dev/null +++ b/docs/design/lora_resolver_plugins.md @@ -0,0 +1,220 @@ +# LoRA Resolver Plugins + +This directory contains vLLM's LoRA resolver plugins built on the `LoRAResolver` framework. +They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts. + +## Overview + +LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM +receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt +to locate and load the adapter from their configured storage locations. This enables: + +- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts +- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source. +- **Automatic Discovery**: Seamless integration with existing LoRA workflows +- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances + +## Prerequisites + +Before using LoRA Resolver Plugins, ensure the following environment variables are configured: + +### Required Environment Variables + +1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading + ```bash + export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true + ``` + +2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list) + ```bash + export VLLM_PLUGINS=lora_filesystem_resolver + ``` + +3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver + ```bash + export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters + ``` + +### Optional Environment Variables + +- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to empty string, no plugins will be loaded. + +## Available Resolvers + +### lora_filesystem_resolver + +The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure. + +#### Setup Steps + +1. **Create the LoRA adapter storage directory**: + ```bash + mkdir -p /path/to/lora/adapters + ``` + +2. **Set environment variables**: + ```bash + export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true + export VLLM_PLUGINS=lora_filesystem_resolver + export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters + ``` + +3. **Start vLLM server**: + Your base model can be `meta-llama/Llama-2-7b-hf`. 
Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`. + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model your-base-model \ + --enable-lora + ``` + +#### Directory Structure Requirements + +The filesystem resolver expects LoRA adapters to be organized in the following structure: + +```text +/path/to/lora/adapters/ +├── adapter1/ +│ ├── adapter_config.json +│ ├── adapter_model.bin +│ └── tokenizer files (if applicable) +├── adapter2/ +│ ├── adapter_config.json +│ ├── adapter_model.bin +│ └── tokenizer files (if applicable) +└── ... +``` + +Each adapter directory must contain: + +- **`adapter_config.json`**: Required configuration file with the following structure: + ```json + { + "peft_type": "LORA", + "base_model_name_or_path": "your-base-model-name", + "r": 16, + "lora_alpha": 32, + "target_modules": ["q_proj", "v_proj"], + "bias": "none", + "modules_to_save": null, + "use_rslora": false, + "use_dora": false + } + ``` + +- **`adapter_model.bin`**: The LoRA adapter weights file + +#### Usage Example + +1. **Prepare your LoRA adapter**: + ```bash + # Assuming you have a LoRA adapter in /tmp/my_lora_adapter + cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter + ``` + +2. **Verify the directory structure**: + ```bash + ls -la /path/to/lora/adapters/my_sql_adapter/ + # Should show: adapter_config.json, adapter_model.bin, etc. + ``` + +3. **Make a request using the adapter**: + ```bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "my_sql_adapter", + "prompt": "Generate a SQL query for:", + "max_tokens": 50, + "temperature": 0.1 + }' + ``` + +#### How It Works + +1. When vLLM receives a request for a LoRA adapter named `my_sql_adapter` +2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists +3. If found, it validates the `adapter_config.json` file +4. If the configuration matches the base model and is valid, the adapter is loaded +5. The request is processed normally with the newly loaded adapter +6. The adapter remains available for future requests + +## Advanced Configuration + +### Multiple Resolvers + +You can configure multiple resolver plugins to load adapters from different sources: + +'lora_s3_resolver' is an example of a custom resolver you would need to implement + +```bash +export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver +``` + +All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds. + +### Custom Resolver Implementation + +To implement your own resolver plugin: + +1. **Create a new resolver class**: + ```python + from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry + from vllm.lora.request import LoRARequest + + class CustomResolver(LoRAResolver): + async def resolve_lora(self, base_model_name: str, lora_name: str) -> Optional[LoRARequest]: + # Your custom resolution logic here + pass + ``` + +2. **Register the resolver**: + ```python + def register_custom_resolver(): + resolver = CustomResolver() + LoRAResolverRegistry.register_resolver("Custom Resolver", resolver) + ``` + +## Troubleshooting + +### Common Issues + +1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"** + - Ensure the directory exists and is accessible + - Check file permissions on the directory + +2. 
**"LoRA adapter not found"** + - Verify the adapter directory name matches the requested model name + - Check that `adapter_config.json` exists and is valid JSON + - Ensure `adapter_model.bin` exists in the directory + +3. **"Invalid adapter configuration"** + - Verify `peft_type` is set to "LORA" + - Check that `base_model_name_or_path` matches your base model + - Ensure `target_modules` is properly configured + +4. **"LoRA rank exceeds maximum"** + - Check that `r` value in `adapter_config.json` doesn't exceed `max_lora_rank` setting + +### Debugging Tips + +1. **Enable debug logging**: + ```bash + export VLLM_LOGGING_LEVEL=DEBUG + ``` + +2. **Verify environment variables**: + ```bash + echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING + echo $VLLM_PLUGINS + echo $VLLM_LORA_RESOLVER_CACHE_DIR + ``` + +3. **Test adapter configuration**: + ```bash + python -c " + import json + with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f: + config = json.load(f) + print('Config valid:', config) + " + ``` diff --git a/vllm/plugins/lora_resolvers/README.md b/vllm/plugins/lora_resolvers/README.md deleted file mode 100644 index 48f27dddea07..000000000000 --- a/vllm/plugins/lora_resolvers/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# LoRA Resolver Plugins - -This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters -via the LoRAResolver plugin framework. - -Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins -to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins. - -## lora_filesystem_resolver - -This LoRA Resolver is installed with vLLM by default. -To use, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request -for a LoRA adapter `foobar` it doesn't currently recognize, it will look in that local directory -for a subdirectory `foobar` containing a LoRA adapter. If such an adapter exists, it will -load that adapter, and then service the request as normal. That adapter will then be available -for future requests as normal. 
From 964d65deedb9ae0480fecdb2e726ba16d63409d7 Mon Sep 17 00:00:00 2001 From: Fardin Hoque Date: Fri, 14 Nov 2025 10:27:56 -0800 Subject: [PATCH 067/578] LLaMA4 LoRA Adapter Enablement (#28602) Signed-off-by: Fardin Hoque Co-authored-by: Wei Wei --- vllm/model_executor/models/mllama4.py | 36 +++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 14e741f32258..e25a104d822a 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -35,6 +35,7 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -45,6 +46,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -68,11 +70,15 @@ MixtureOfExperts, MultiModalEmbeddings, SupportsEagle3, + SupportsLoRA, SupportsMultiModal, SupportsPP, ) from .llama4 import Llama4ForCausalLM -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import ( + AutoWeightsLoader, + maybe_prefix, +) from .vision import run_dp_sharded_vision_model @@ -724,7 +730,12 @@ def get_dummy_mm_data( dummy_inputs=Mllama4DummyInputsBuilder, ) class Llama4ForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsPP, MixtureOfExperts, SupportsEagle3 + nn.Module, + SupportsMultiModal, + SupportsPP, + MixtureOfExperts, + SupportsEagle3, + SupportsLoRA, ): merge_by_field_config = True @@ -1067,6 +1078,17 @@ def _load_other_weights( return updated_params + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.text_config.num_local_experts, + num_redundant_experts=self.num_redundant_experts, + ) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -1113,3 +1135,13 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) return updated_params + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector.", + tower_model="vision_model.", + ) From a425dc256e4c2f76f98be136cd898b43f02e6a32 Mon Sep 17 00:00:00 2001 From: TJian Date: Fri, 14 Nov 2025 10:30:50 -0800 Subject: [PATCH 068/578] [Bugfix] [ROCm] [AITER]: Fix aiter block quant not compatible with torch compile dynamo (#28716) Signed-off-by: tjtanaa --- tests/rocm/aiter/test_grouped_quant.py | 137 ++++++++++++++++++ vllm/_aiter_ops.py | 48 +++++- .../layers/quantization/utils/fp8_utils.py | 2 +- 3 files changed, 180 insertions(+), 7 deletions(-) create mode 100644 tests/rocm/aiter/test_grouped_quant.py diff --git 
a/tests/rocm/aiter/test_grouped_quant.py b/tests/rocm/aiter/test_grouped_quant.py new file mode 100644 index 000000000000..c7f0f1eda355 --- /dev/null +++ b/tests/rocm/aiter/test_grouped_quant.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# This is a test for the AITER group_fp8_quant op. +# It tests if the AITER op is +# 1. correctly defined the relationship between +# implementation and fake function +# 2. can be used with torch.compile +# 3. can be used with CUDA graphs +# This file will be skipped if AITER is not installed +# and the platform is not ROCm. + +import importlib.util + +import pytest +import torch + +# this import statement is needed to ensure the ops are registered +from vllm._aiter_ops import rocm_aiter_ops +from vllm.platforms import current_platform + +# Check if aiter package is installed +aiter_available = importlib.util.find_spec("aiter") is not None + +pytestmark = pytest.mark.skipif( + not (current_platform.is_rocm() and aiter_available), + reason="AITER ops are only available on ROCm with aiter package installed", +) + + +def test_rocm_aiter_group_fp8_quant_fake_implementation(): + """Test that the fake implementation is correctly + defined for torch.ops.vllm.rocm_aiter_group_fp8_quant.""" + # Create test tensors + M = 128 + N = 4096 + group_size = 128 + + input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + + # Verify the op's fake implementation using torch.library.opcheck + # This checks that the fake function returns tensors with correct shapes and dtypes + torch.library.opcheck( + torch.ops.vllm.rocm_aiter_group_fp8_quant, + (input_tensor, group_size), + test_utils=("test_faketensor",), + ) + + +def test_rocm_aiter_group_fp8_quant_torch_compile_with_cudagraph(): + """Test that rocm_aiter_ops.group_fp8_quant + with group size 128 can be used with + torch.compile in cudagraph mode.""" + # Create test tensors + M = 128 + N = 4096 + group_size = 128 + + input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + + # Define a function that uses the op + def group_fp8_quant_fn(x): + return rocm_aiter_ops.group_fp8_quant(x, group_size) + + # Compile with cudagraph mode + compiled_fn = torch.compile( + group_fp8_quant_fn, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False, + ) + + # Run eager mode + x_fp8_eager, scales_eager = group_fp8_quant_fn(input_tensor) + + # Run compiled version (first run will trigger compilation) + x_fp8_compiled, scales_compiled = compiled_fn(input_tensor) + + # Verify shapes match + assert x_fp8_compiled.shape == x_fp8_eager.shape + assert scales_compiled.shape == scales_eager.shape + + # Verify expected shapes + assert x_fp8_compiled.shape == (M, N) + expected_scale_cols = (N + group_size - 1) // group_size + assert scales_compiled.shape == (M, expected_scale_cols) + + # Verify results match + assert torch.allclose( + x_fp8_compiled.to(torch.float32), + x_fp8_eager.to(torch.float32), + rtol=1e-2, + atol=1e-2, + ) + assert torch.allclose(scales_compiled, scales_eager, rtol=1e-3, atol=1e-3) + + # Test with different input (reusing compiled graph) + input_tensor_2 = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + x_fp8_eager_2, scales_eager_2 = group_fp8_quant_fn(input_tensor_2) + x_fp8_compiled_2, scales_compiled_2 = compiled_fn(input_tensor_2) + + # Verify second run also produces correct results + assert torch.allclose( + x_fp8_compiled_2.to(torch.float32), + 
x_fp8_eager_2.to(torch.float32), + rtol=1e-2, + atol=1e-2, + ) + assert torch.allclose(scales_compiled_2, scales_eager_2, rtol=1e-3, atol=1e-3) + + +def test_rocm_aiter_group_fp8_quant_different_shapes(): + """Test rocm_aiter_ops.group_fp8_quant with different input shapes.""" + group_size = 128 + + test_shapes = [ + (64, 2048), + (256, 8192), + (32, 1024), + (512, 4096), + ] + + for M, N in test_shapes: + input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + + x_fp8, scales = rocm_aiter_ops.group_fp8_quant(input_tensor, group_size) + + # Verify shapes + assert x_fp8.shape == (M, N) + expected_scale_cols = (N + group_size - 1) // group_size + assert scales.shape == (M, expected_scale_cols) + + # Verify dtypes + from aiter import dtypes + + assert x_fp8.dtype == dtypes.fp8 + assert scales.dtype == torch.float32 diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 7c35bf1857ba..e53e4ae6e529 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -43,6 +43,36 @@ def wrapper(*args, **kwargs): return wrapper +def _rocm_aiter_group_fp8_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" + from aiter import QuantType, dtypes, get_hip_quant + + aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) + return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8) + + +def _rocm_aiter_group_fp8_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter import dtypes + + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + def _rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -512,6 +542,14 @@ def register_ops_once() -> None: ) # register all the custom ops here + direct_register_custom_op( + op_name="rocm_aiter_group_fp8_quant", + op_func=_rocm_aiter_group_fp8_quant_impl, + mutates_args=[], + fake_impl=_rocm_aiter_group_fp8_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + direct_register_custom_op( op_name="rocm_aiter_asm_moe_tkw1", op_func=_rocm_aiter_asm_moe_tkw1_impl, @@ -887,14 +925,12 @@ def triton_gemm_a8w8_blockscale( return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) @staticmethod - def per_1x128_fp8_quant( + def group_fp8_quant( input_2d: torch.Tensor, + group_size: int = 128, ) -> tuple[torch.Tensor, ...]: - """Only applies quantization method for fp8 data type only.""" - from aiter import QuantType, dtypes, get_hip_quant - - aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) - return aiter_per1x128_quant(input_2d.contiguous(), quant_dtype=dtypes.fp8) + assert group_size == 128, "Group size must be 128" + return torch.ops.vllm.rocm_aiter_group_fp8_quant(input_2d, group_size) @staticmethod def is_triton_gemm_w8a8_tuned(n: int, k: int) -> bool: diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 541c6c631053..ae63b4a76726 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -342,7 +342,7 @@ def _run_aiter( ) # MI300 uses tuned AITER ASM/C++ kernel else: - q_input, input_scale = rocm_aiter_ops.per_1x128_fp8_quant(input_2d) + q_input, input_scale = 
rocm_aiter_ops.group_fp8_quant(input_2d) return gemm_a8w8_blockscale_op( q_input, From 67187554dd478ba76e79d7a6f8bf02be01290de3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:39:19 +0000 Subject: [PATCH 069/578] [Docs] Enable some more markdown lint rules for the docs (#28731) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .markdownlint.yaml | 3 --- docs/contributing/benchmarks.md | 2 -- docs/contributing/ci/update_pytorch_version.md | 2 +- docs/deployment/frameworks/chatbox.md | 4 ++-- docs/deployment/frameworks/dify.md | 6 +++--- docs/design/fused_moe_modular_kernel.md | 8 ++++---- 6 files changed, 10 insertions(+), 15 deletions(-) diff --git a/.markdownlint.yaml b/.markdownlint.yaml index d0d3179766ef..937487f47364 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -6,9 +6,6 @@ MD024: MD031: list_items: false MD033: false -MD045: false MD046: false -MD051: false MD052: false -MD053: false MD059: false diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index dca01eab5b42..ec0dfc4199d1 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -10,8 +10,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu - **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development -[Benchmark CLI]: #benchmark-cli - ## Benchmark CLI This section guides you through running benchmark tests with the extensive diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index f983c25f26ee..09fd85a466ee 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -95,7 +95,7 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin to warm it up so that future builds are faster.

- + Buildkite new build popup

## Update dependencies diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md index 002935da5600..5f7cef1a87df 100644 --- a/docs/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -29,8 +29,8 @@ pip install vllm - API Path: `/chat/completions` - Model: `qwen/Qwen1.5-0.5B-Chat` - ![](../../assets/deployment/chatbox-settings.png) + ![Chatbox settings screen](../../assets/deployment/chatbox-settings.png) 1. Go to `Just chat`, and start to chat: - ![](../../assets/deployment/chatbox-chat.png) + ![Chatbot chat screen](../../assets/deployment/chatbox-chat.png) diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index 820ef0cbed9f..673cbf4b6a24 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Completion Mode**: `Completion` - ![](../../assets/deployment/dify-settings.png) + ![Dify settings screen](../../assets/deployment/dify-settings.png) 1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: - ![](../../assets/deployment/dify-create-chatbot.png) + ![Dify create chatbot screen](../../assets/deployment/dify-create-chatbot.png) 1. Click the chatbot you just created to open the chat interface and start interacting with the model: - ![](../../assets/deployment/dify-chat.png) + ![Dify chat screen](../../assets/deployment/dify-chat.png) diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 76df0d8d8a38..e1a96be6c344 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below -![](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png "FusedMoE Non-Batched") +![FusedMoE Non-Batched](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png) -![](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png "FusedMoE Batched") +![FusedMoE Batched](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png) !!! note The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain. @@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions. The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. 
Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) -![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks") +![FusedMoEPrepareAndFinalize Blocks](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png) ### FusedMoEPermuteExpertsUnpermute @@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section. `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use. -![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks") +![FusedMoEPermuteExpertsUnpermute Blocks](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png) ### FusedMoEModularKernel From e2741f6cbce6dc4c364d0a8d77375259d72a21ef Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 02:39:57 +0800 Subject: [PATCH 070/578] [Chore] Rename `SchedulerConfig.chunked_prefill_enabled` (#28735) Signed-off-by: DarkLight1337 --- tests/v1/core/test_scheduler.py | 1 - tests/v1/e2e/test_spec_decode.py | 10 ++++------ tests/v1/engine/test_engine_core.py | 2 +- vllm/config/scheduler.py | 11 ++++++++--- vllm/config/vllm.py | 6 +++--- vllm/platforms/cpu.py | 2 +- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 9 files changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 287e735b5491..04e738293cd7 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder( ) -> None: """Validate chunked prefill settings in the scheduler config for encoder-decoder models.""" - assert scheduler_config.chunked_prefill_enabled is expect_enabled assert scheduler_config.enable_chunked_prefill is expect_enabled if is_encoder_decoder: # Encoder-decoder models should automatically disable chunked multimodal diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 4a6b84ae4817..6cffaafb127e 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -272,7 +272,7 @@ def test_speculators_model_integration( @pytest.mark.parametrize( - ["model_setup", "mm_enabled", "chunked_prefill_enabled"], + ["model_setup", "mm_enabled", "enable_chunked_prefill"], [ (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), pytest.param( @@ -358,7 +358,7 @@ def test_eagle_correctness( sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], mm_enabled: bool, - chunked_prefill_enabled: bool, + enable_chunked_prefill: bool, attn_backend: str, ): if attn_backend == "TREE_ATTN": @@ -396,9 +396,7 @@ def test_eagle_correctness( method, model_name, spec_model_name, tp_size = model_setup max_model_len = 2048 - max_num_batched_tokens = max_model_len - if chunked_prefill_enabled: - max_num_batched_tokens = 128 + 
max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len ref_llm = LLM( model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size @@ -420,7 +418,7 @@ def test_eagle_correctness( }, max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=chunked_prefill_enabled, + enable_chunked_prefill=enable_chunked_prefill, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 4e852dca95eb..3ba8ab26f552 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache( ) # Check 5: Verify chunked prefill is disabled - assert not vllm_config.scheduler_config.chunked_prefill_enabled, ( + assert not vllm_config.scheduler_config.enable_chunked_prefill, ( "Encoder instance should disable chunked prefill (no KV cache)" ) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 5117344a6844..444568994a95 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -8,7 +8,7 @@ from pydantic import Field, field_validator, model_validator from pydantic.dataclasses import dataclass -from typing_extensions import Self +from typing_extensions import Self, deprecated from vllm.config.utils import config from vllm.logger import init_logger @@ -233,6 +233,11 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: ) @property + @deprecated( + "`SchedulerConfig.chunked_prefill_enabled` has been renamed to " + "`SchedulerConfig.enable_chunked_prefill`. " + "The old name will be removed in v0.12." + ) def chunked_prefill_enabled(self) -> bool: return self.enable_chunked_prefill @@ -244,7 +249,7 @@ def chunked_prefill_enabled(self, value: bool): def _verify_args(self) -> Self: if ( self.max_num_batched_tokens < self.max_model_len - and not self.chunked_prefill_enabled + and not self.enable_chunked_prefill ): raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " @@ -271,7 +276,7 @@ def _verify_args(self) -> Self: ) if self.max_num_partial_prefills > 1: - if not self.chunked_prefill_enabled: + if not self.enable_chunked_prefill: raise ValueError( "Chunked prefill must be enabled to set " "max_num_partial_prefills > 1." 
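The rename above keeps the old `SchedulerConfig.chunked_prefill_enabled` attribute working through a deprecated property alias. A minimal standalone sketch of that pattern, using an illustrative config class rather than the real vLLM one:

```python
from dataclasses import dataclass

from typing_extensions import deprecated


@dataclass
class DemoSchedulerConfig:
    enable_chunked_prefill: bool = False

    @property
    @deprecated(
        "`chunked_prefill_enabled` has been renamed to `enable_chunked_prefill`."
    )
    def chunked_prefill_enabled(self) -> bool:
        # Reads through the old name go via this deprecated getter.
        return self.enable_chunked_prefill

    @chunked_prefill_enabled.setter
    def chunked_prefill_enabled(self, value: bool) -> None:
        # Writes to the old name are forwarded to the new field.
        self.enable_chunked_prefill = value


cfg = DemoSchedulerConfig(enable_chunked_prefill=True)
assert cfg.chunked_prefill_enabled  # emits a DeprecationWarning at runtime
cfg.chunked_prefill_enabled = False
assert cfg.enable_chunked_prefill is False
```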
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f581267f73f7..1e6e455210c8 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -411,7 +411,7 @@ def __post_init__(self): if ( self.model_config is not None - and self.scheduler_config.chunked_prefill_enabled + and self.scheduler_config.enable_chunked_prefill and self.model_config.dtype == torch.float32 and current_platform.get_device_capability() == (7, 5) ): @@ -584,7 +584,7 @@ def __post_init__(self): ): for reason in disable_chunked_prefill_reasons: logger.info(reason) - self.scheduler_config.chunked_prefill_enabled = False + self.scheduler_config.enable_chunked_prefill = False self.scheduler_config.long_prefill_token_threshold = 0 if self.cache_config is not None: @@ -1026,7 +1026,7 @@ def __str__(self): f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " - f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa + f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, " # noqa f"pooler_config={self.model_config.pooler_config!r}, " f"compilation_config={self.compilation_config!r}" ) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index fdfa1c19789c..1da34629472c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -192,7 +192,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: scheduler_config = vllm_config.scheduler_config if ( - scheduler_config.chunked_prefill_enabled + scheduler_config.enable_chunked_prefill or cache_config.enable_prefix_caching ) and cache_config.cache_dtype != "auto": raise RuntimeError( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4fcc7955df19..ba7ad0c09173 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -497,7 +497,7 @@ def schedule(self) -> SchedulerOutput: # chunked prefill has to be enabled explicitly to allow # pooling requests to be chunked if ( - not self.scheduler_config.chunked_prefill_enabled + not self.scheduler_config.enable_chunked_prefill and num_new_tokens > token_budget ): self.waiting.pop_request() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ffb5232e770d..a6965182fc2c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -124,7 +124,7 @@ def __init__( # Encoder models without KV cache don't support # chunked prefill. But do SSM models? 
logger.info("Disabling chunked prefill for model without KVCache") - vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.enable_chunked_prefill = False scheduler_block_size = ( vllm_config.cache_config.block_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 341bf58f2da8..9b3e5b668aab 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2031,7 +2031,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: supported_tasks = list(model.pooler.get_supported_tasks()) - if self.scheduler_config.chunked_prefill_enabled: + if self.scheduler_config.enable_chunked_prefill: if "token_embed" in supported_tasks: supported_tasks.remove("token_embed") if "token_classify" in supported_tasks: @@ -3825,7 +3825,7 @@ def _dummy_pooler_run( supported_pooling_tasks = self.get_supported_pooling_tasks() if not supported_pooling_tasks: - if self.scheduler_config.chunked_prefill_enabled: + if self.scheduler_config.enable_chunked_prefill: raise RuntimeError( f"Model {self.model_config.model} does not support " "any pooling tasks with chunked prefill enabled. " From cec275efcef62a9fb6ea5c3445572dddf9736206 Mon Sep 17 00:00:00 2001 From: GuanH <60228748+GuanH@users.noreply.github.com> Date: Sat, 15 Nov 2025 02:44:27 +0800 Subject: [PATCH 071/578] [Bugfix] resolve Qwen3-VL GPTQModel quantized model loading failure (#28663) Signed-off-by: GuanH Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/models/qwen3_vl.py | 4 +++- vllm/model_executor/models/utils.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index faeb9f81d961..f1c020ab5813 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1138,7 +1138,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.quant_config = quant_config - self.model = Qwen3LLMModel(vllm_config=vllm_config, prefix=prefix) + self.model = Qwen3LLMModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) if get_pp_group().is_last_rank: if config.tie_word_embeddings: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index f14b79f2886c..e5663c8a057a 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -117,9 +117,10 @@ class AutoWeightsLoader: environment variable `VLLM_LOGGING_LEVEL=DEBUG`. """ - # Models trained using early version ColossalAI - # may include these tensors in checkpoint. Skip them. + # Models trained using early version ColossalAI or quantized by + # GPTQModel may include these tensors in checkpoint. Skip them. ROTARY_EMBEDS_UNUSED_WEIGHTS = [ + "rotary_pos_emb.inv_freq", "rotary_emb.inv_freq", "rotary_emb.cos_cached", "rotary_emb.sin_cached", From fd4555089a7ea3094499d9a6a9cec1c1b6903674 Mon Sep 17 00:00:00 2001 From: Andrey Khalyavin Date: Fri, 14 Nov 2025 21:58:18 +0300 Subject: [PATCH 072/578] [BugFix] Fix misprint introduced by modular_kernel refactoring. 
(#28728) Signed-off-by: Andrey Khalyavin --- vllm/model_executor/layers/fused_moe/modular_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index a3142f37053f..093affe51f50 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1060,7 +1060,7 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]: global_num_experts=global_num_experts, expert_map=expert_map, a1q_scale=_slice_scales(a1q_scale, s, e), - a2_scale=_slice_scales(self.fused_experts.a2_scale, e, e), + a2_scale=_slice_scales(self.fused_experts.a2_scale, s, e), workspace13=workspace13, workspace2=workspace2, expert_tokens_meta=c_expert_tokens_meta, From 8977ffb5e6428a3e682d47d9ca8342ccab9916f8 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 14 Nov 2025 11:06:01 -0800 Subject: [PATCH 073/578] [ROCm][Bugfix] Fix compilation errors with fused_qknorm_rope_kernel.cu (#28682) Signed-off-by: Sage Moore --- csrc/fused_qknorm_rope_kernel.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index 83017250ebcd..baff8363162e 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -37,6 +37,16 @@ #ifdef USE_ROCM #define FINAL_MASK 0xffffffffffffffffULL + + #if defined(HIP_VERSION) && HIP_VERSION < 70000000 +// On ROCm versions before 7.0, __syncwarp isn't defined. The below +// implementation is copy/pasted from the implementation in ROCm 7.0 +__device__ inline void __syncwarp() { + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); + __builtin_amdgcn_wave_barrier(); + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); +} + #endif #else #define FINAL_MASK 0xffffffff #endif From f08eab2acc17da9e86d20673bd801659ca912749 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 15:29:55 -0500 Subject: [PATCH 074/578] [CI] Fix macos smoke test uv cache issue (#28736) Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index f56fdc0dbe79..8d40aa587bf0 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -11,9 +11,12 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: astral-sh/setup-uv@v4 + - uses: astral-sh/setup-uv@v7 with: enable-cache: true + cache-dependency-glob: | + requirements/**/*.txt + pyproject.toml python-version: '3.12' - name: Install dependencies From 0de4f217abe2c73ce6df52743365302466f7bc00 Mon Sep 17 00:00:00 2001 From: Marcin Ostrowski Date: Fri, 14 Nov 2025 22:13:53 +0100 Subject: [PATCH 075/578] [Bugfix] TypeError: 'NoneType' object is not callable (#27410) Signed-off-by: Marcin Ostrowski --- tests/v1/core/test_kv_cache_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index df6a5f109874..24611a4aaa1b 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -55,7 +55,7 @@ def _auto_init_hash_fn(request): hash_fn: Callable if "hash_fn" in request.fixturenames: - hash_fn = init_none_hash(request.getfixturevalue("hash_fn")) + hash_fn = request.getfixturevalue("hash_fn") else: hash_fn = sha256 init_none_hash(hash_fn) From 
5a84b76b86e03694d612afc8f0225512d9b4ddc9 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:34:18 -0500 Subject: [PATCH 076/578] [ROCm][CI/Build] Change install location of uv (#28741) Signed-off-by: Gregory Shtrasberg --- docker/Dockerfile.rocm | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 137452cad2c1..731a97d93da1 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -17,10 +17,7 @@ RUN python3 -m pip install --upgrade pip RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" # Install UV -RUN curl -LsSf https://astral.sh/uv/install.sh | sh - -# Activate virtual environment and add uv to PATH -ENV PATH="/root/.local/bin:$PATH" +RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 From 2e0ad629b0422358d424e1fcfddeb22d102936e8 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 14 Nov 2025 14:11:10 -0800 Subject: [PATCH 077/578] Avoid bytecode hook and simplify TorchCompileWrapperWithCustomDipatch (#25110) Signed-off-by: Laith Sakka --- .../compile/piecewise/test_multiple_graphs.py | 11 +- tests/compile/piecewise/test_simple.py | 3 + tests/compile/piecewise/test_toy_llama.py | 9 +- tests/compile/test_wrapper.py | 155 +++++++++--- .../multimodal/generation/test_qwen2_5_vl.py | 10 + tests/v1/e2e/test_spec_decode.py | 8 + vllm/compilation/decorators.py | 234 +++++++++--------- vllm/compilation/wrapper.py | 212 ++++++++++------ vllm/envs.py | 6 + vllm/v1/worker/tpu_model_runner.py | 10 +- 10 files changed, 422 insertions(+), 236 deletions(-) diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index 64d626bae483..6d3788af9de0 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -22,6 +22,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ...utils import create_new_process_for_each_test + # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -193,7 +195,14 @@ def run_model( @pytest.mark.parametrize("use_inductor_graph_partition", [False, True]) -def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) +@create_new_process_for_each_test("spawn") +def test_multi_graph_piecewise_compile( + use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch +): + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index a48af8a8952a..e258133ab50a 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -21,6 +21,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ...utils import create_new_process_for_each_test + # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -124,6 +126,7 @@ def _run_simple_model( @pytest.mark.parametrize("use_inductor", [True, False]) @torch.inference_mode() +@create_new_process_for_each_test("spawn") def test_simple_piecewise_compile(use_inductor): _run_simple_model( splitting_ops=["silly::attention"], diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 92998ede1699..915fbc6ce7f3 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -29,6 +29,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ...utils import create_new_process_for_each_test + # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor: ("inductor", True), # Inductor, Inductor partition ], ) +@create_new_process_for_each_test("spawn") def test_toy_llama( backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path ): @@ -513,4 +516,8 @@ def benchmark(): if __name__ == "__main__": - benchmark() + # Protect against subprocess reimport when using spawn_new_process_for_each_test + import os + + if os.environ.get("RUNNING_IN_SUBPROCESS") != "1": + benchmark() diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index da0afd9eaa49..356cac7af258 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -2,59 +2,134 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest import torch -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationMode +from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper +from vllm.config import ( + CompilationConfig, + CompilationMode, + VllmConfig, + set_current_vllm_config, +) class MyMod(torch.nn.Module): def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): - if cache is not None: - return x + cache - return x * 2 + if x.size()[0] >= 4: + return x * 2 + else: + return x * 100 -class MyWrapper(TorchCompileWrapperWithCustomDispatcher): +class MyWrapper(TorchCompileWithNoGuardsWrapper): def __init__(self, model): self.model = model - compiled_callable = torch.compile(self.forward, backend="eager") - super().__init__( - compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE - ) + super().__init__() - def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): + def forward(self, x: torch.Tensor): # type: ignore[override] # this is the function to be compiled - return self.model(x, cache) - - def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None): - # let torch.compile compile twice - if len(self.compiled_codes) == 2: - dispatch_id = 0 if cache is None else 1 - with self.dispatch_to_code(dispatch_id): - return self.forward(x, cache) - else: - return self.compiled_callable(x, cache) + return self.model(x) + +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) +def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch): + """Test basic functionality of TorchCompileWithNoGuardsWrapper.""" + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") -def test_torch_compile_wrapper(): - mod = MyMod() - wrappers = [] - for i in range(3): + # Create a proper vLLM config instead of mocking + vllm_config = VllmConfig() + vllm_config.compilation_config = CompilationConfig() + vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE + vllm_config.compilation_config.backend = "inductor" + + # Test DYNAMO_TRACE_ONCE + with set_current_vllm_config(vllm_config): torch._dynamo.reset() + mod = MyMod() + wrapper = MyWrapper(mod) + + # First call should trigger compilation + x = torch.tensor([1, 2, 3, 4]) + torch._dynamo.mark_dynamic(x, 0) + + result1 = wrapper(x) + expected1 = torch.tensor([2, 4, 6, 8]) + assert torch.allclose(result1, expected1), ( + f"Expected {expected1}, got {result1}" + ) + + # Second call should use compiled code + x2 = torch.tensor([1, 2, 3]) + result2 = wrapper(x2) + expected2 = torch.tensor([2, 4, 6]) + assert torch.allclose(result2, 
expected2), ( + f"Expected {expected2}, got {result2}" + ) + + # without the wrapper result would be different. + result3 = mod(x2) + expected3 = torch.tensor([100, 200, 300]) + + assert torch.allclose(result3, expected3), ( + f"Expected {result3}, got {expected3}" + ) + + # with STOCK_TORCH_COMPILE we do not remove guards. + vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE + torch._dynamo.reset() + with set_current_vllm_config(vllm_config): + mod = MyMod() wrapper = MyWrapper(mod) - wrappers.append(wrapper) - x = torch.tensor([1]) - wrapper(x, None) # profile run, compile - # create a cache tensor - cache = torch.tensor([2]) - wrapper(x, cache) # warm up with cache, recompile - - # for new input, dispatch to the compiled code directly - new_x = torch.tensor([3]) - assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code - assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code - - for wrapper in wrappers: - # make sure they have independent compiled codes - assert len(wrapper.compiled_codes) == 2 + + # First call should trigger compilation + x = torch.tensor([1, 2, 3, 4]) + torch._dynamo.mark_dynamic(x, 0) + + result1 = wrapper(x) + expected1 = torch.tensor([2, 4, 6, 8]) + assert torch.allclose(result1, expected1), ( + f"Expected {expected1}, got {result1}" + ) + + # Second call should triger another compilation + x2 = torch.tensor([1, 2, 3]) + result2 = wrapper(x2) + expected2 = torch.tensor([100, 200, 300]) + assert torch.allclose(result2, expected2), ( + f"Expected {expected2}, got {result2}" + ) + + # NO_COMPILATION level not supported. + vllm_config.compilation_config.mode = None + torch._dynamo.reset() + with set_current_vllm_config(vllm_config): + torch._dynamo.reset() + mod = MyMod() + + try: + wrapper = MyWrapper(mod) + except Exception: + return + raise AssertionError("expected an exception to be raised") + + +if __name__ == "__main__": + # Run with both parameter values + + class MockMonkeypatch: + def setenv(self, name, value): + os.environ[name] = value + + mp = MockMonkeypatch() + + print("Testing with VLLM_USE_BYTECODE_HOOK=False") + test_torch_compile_wrapper(False, mp) + + print("Testing with VLLM_USE_BYTECODE_HOOK=True") + test_torch_compile_wrapper(True, mp) + + print("All tests passed!") diff --git a/tests/models/multimodal/generation/test_qwen2_5_vl.py b/tests/models/multimodal/generation/test_qwen2_5_vl.py index 6b009075abfa..3ba665710af4 100644 --- a/tests/models/multimodal/generation/test_qwen2_5_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_5_vl.py @@ -34,6 +34,7 @@ def qwen2_5_vl_chat_template(*query): @pytest.mark.parametrize("num_frames", [16]) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) def test_qwen2_5_vl_evs_functionality( vllm_runner, video_assets, @@ -42,10 +43,14 @@ def test_qwen2_5_vl_evs_functionality( num_frames: int, dtype: str, max_tokens: int, + use_bytecode_hook: bool, + monkeypatch, ) -> None: """Test EVS (Efficient Video Sampling) functionality with different pruning rates. 
""" + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") # Sample frames from video assets sampled_vids = [ @@ -86,6 +91,7 @@ def test_qwen2_5_vl_evs_functionality( @pytest.mark.parametrize("num_frames", [16]) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) def test_qwen2_5_vl_evs_batched_videos( vllm_runner, video_assets, @@ -94,6 +100,8 @@ def test_qwen2_5_vl_evs_batched_videos( num_frames: int, dtype: str, max_tokens: int, + use_bytecode_hook: bool, + monkeypatch, ) -> None: """Test EVS functionality with batched videos. @@ -102,6 +110,8 @@ def test_qwen2_5_vl_evs_batched_videos( 2. Both pruning configurations work with multiple videos 3. The model doesn't crash when processing multiple videos simultaneously """ + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") # Sample frames from video assets sampled_vids = [ sample_frames_from_video(asset.np_ndarrays, num_frames) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 6cffaafb127e..03396270a31c 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -75,6 +75,14 @@ def model_name(): return "meta-llama/Llama-3.1-8B-Instruct" +@pytest.fixture(autouse=True) +def reset_torch_dynamo(): + """Reset torch dynamo cache before each test""" + yield + # Cleanup after test + torch._dynamo.reset() + + @pytest.mark.parametrize( "speculative_config", [ diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 0946fa69171b..e325bca73abb 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -17,7 +17,7 @@ import vllm.envs as envs from vllm.compilation.counter import compilation_counter -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper from vllm.config import ( CompilationMode, VllmConfig, @@ -246,14 +246,14 @@ def _support_torch_compile( """ A decorator to add support for compiling the forward method of a class. 
""" - if TorchCompileWrapperWithCustomDispatcher in cls.__bases__: + if TorchCompileWithNoGuardsWrapper in cls.__bases__: # support decorating multiple times return cls # take care of method resolution order # make sure super().__init__ is called on the base class - # other than TorchCompileWrapperWithCustomDispatcher - cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher,) + # other than TorchCompileWithNoGuardsWrapper + cls.__bases__ = cls.__bases__ + (TorchCompileWithNoGuardsWrapper,) old_init = cls.__init__ @@ -290,12 +290,43 @@ def __init__( return compilation_counter.num_models_seen += 1 - TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_mode=vllm_config.compilation_config.mode - ) + self.compiled = False + TorchCompileWithNoGuardsWrapper.__init__(self) cls.__init__ = __init__ + def _mark_dynamic_inputs(mod, *args, **kwargs): + sig = inspect.signature(mod.__class__.forward) + bound_args = sig.bind(mod, *args, **kwargs) + bound_args.apply_defaults() + for k, dims in dynamic_arg_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + dims = [dims] if isinstance(dims, int) else dims + if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.mark_dynamic(arg, dims) + elif isinstance(arg, IntermediateTensors): + for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.mark_dynamic(tensor, dims) + else: + raise ValueError( + "Unsupported dynamic dimensions" + f" {dims} for argument {k} with type {type(arg)}." + ) + if mark_unbacked_dims: + for k, dims in mark_unbacked_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + dims = [dims] if isinstance(dims, int) else dims + if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.decorators.mark_unbacked(arg, dims) + def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation # e.g. TPU has the compilation logic in model runner, so we don't @@ -303,6 +334,7 @@ def __call__(self, *args, **kwargs): if self.do_not_compile or torch.compiler.is_compiling(): return self.forward(*args, **kwargs) + # if aot_compiled_fn is set, just call it. if getattr(self, "aot_compiled_fn", None) is not None: return self.aot_compiled_fn(self, *args, **kwargs) @@ -362,120 +394,84 @@ def __call__(self, *args, **kwargs): ) return self.aot_compiled_fn(self, *args, **kwargs) + if self.compiled: + assert not envs.VLLM_USE_AOT_COMPILE + return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) + + # This is the path for the first compilation. 
+ # the first compilation needs to have dynamic shapes marked - if len(self.compiled_codes) < 1: - sig = inspect.signature(self.__class__.forward) - bound_args = sig.bind(self, *args, **kwargs) - bound_args.apply_defaults() - for k, dims in dynamic_arg_dims.items(): - arg = bound_args.arguments.get(k) - if arg is not None: - dims = [dims] if isinstance(dims, int) else dims - if isinstance(arg, torch.Tensor): - # In case dims is specified with negative indexing - dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] - torch._dynamo.mark_dynamic(arg, dims) - elif isinstance(arg, IntermediateTensors): - for tensor in arg.tensors.values(): - # In case dims is specified with negative indexing - dims = [ - tensor.ndim + dim if dim < 0 else dim for dim in dims - ] - torch._dynamo.mark_dynamic(tensor, dims) - else: - raise ValueError( - "Unsupported dynamic dimensions" - f" {dims} for argument {k} with type {type(arg)}." - ) - if mark_unbacked_dims: - for k, dims in mark_unbacked_dims.items(): - arg = bound_args.arguments.get(k) - if arg is not None: - dims = [dims] if isinstance(dims, int) else dims - if isinstance(arg, torch.Tensor): - # In case dims is specified with negative indexing - dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] - torch._dynamo.decorators.mark_unbacked(arg, dims) - # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config) - logger.debug("Start compiling function %s", self.original_code_object) - - # if we don't use custom dispatcher, we can directly call the - # compiled function and let torch.compile handle the dispatching, - # with the overhead of guard evaluation and recompilation. - if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher: - # it seems Dynamo reuse the compilation across instances, - # while we need to make sure the compiled code is not reused. - # we need to control all the compilation of the model. - torch._dynamo.eval_frame.remove_from_cache(self.original_code_object) - - # collect all relevant files traced by Dynamo, - # so that the compilation cache can trigger re-compilation - # properly when any of these files change. - - # 1. the file containing the top-level forward function - self.vllm_config.compilation_config.traced_files.add( - self.original_code_object.co_filename - ) + _mark_dynamic_inputs(self, *args, **kwargs) - # 2. every time Dynamo sees a function call, it will inline - # the function by calling InliningInstructionTranslator.inline_call_ - # we hijack this function to know all the functions called - # during Dynamo tracing, and their corresponding files - inline_call = InliningInstructionTranslator.inline_call_ - - def patched_inline_call(self_): - code = self_.f_code - self.vllm_config.compilation_config.traced_files.add(code.co_filename) - return inline_call(self_) - - # Disable the C++ compilation of symbolic shape guards. C++-fication - # of symbolic shape guards can improve guard overhead. But, since - # vllm skip guards anyways, setting this flag to False can improve - # compile time. 
- dynamo_config_patches = {} - try: - _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards - dynamo_config_patches["enable_cpp_symbolic_shape_guards"] = False - except AttributeError: - # Note: this config is not available in torch 2.6, we can skip - # if the config doesn't exist - logger.debug("enable_cpp_symbolic_shape_guards config not available") - - with ( - patch.object( - InliningInstructionTranslator, "inline_call_", patched_inline_call - ), - torch._dynamo.config.patch(**dynamo_config_patches), - maybe_use_cudagraph_partition_wrapper(self.vllm_config), - _torch27_patch_tensor_subclasses(), - ): - if envs.VLLM_USE_AOT_COMPILE: - self.aot_compiled_fn = self.aot_compile(*args, **kwargs) - output = self.aot_compiled_fn(self, *args, **kwargs) - assert aot_compilation_path is not None - assert cache_dir is not None - try: - os.makedirs(cache_dir, exist_ok=True) - self.aot_compiled_fn.save_compiled_function( - aot_compilation_path - ) - except Exception as e: - logger.warning( - "Cannot save aot compilation to path %s, error: %s", - aot_compilation_path, - str(e), - ) - else: - output = self.compiled_callable(*args, **kwargs) - return output - - # usually, capturing the model once is enough, and then we can - # dispatch to the compiled code directly, without going through - # the Dynamo guard mechanism. - with self.dispatch_to_code(0): - model_output = self.forward(*args, **kwargs) - return model_output + # here, it is the starting point of the `torch.compile` process + start_monitoring_torch_compile(self.vllm_config) + original_code_object = self.original_code_object() + logger.debug("Start compiling function %s", original_code_object) + + # we do not want tp delete the original code object entries since + # we depend on them now to look up cached compiled functions. + # torch._dynamo.eval_frame.remove_from_cache(original_code_object) + + # collect all relevant files traced by Dynamo, + # so that the compilation cache can trigger re-compilation + # properly when any of these files change. + + # 1. the file containing the top-level forward function + self.vllm_config.compilation_config.traced_files.add( + original_code_object.co_filename + ) + + # 2. every time Dynamo sees a function call, it will inline + # the function by calling InliningInstructionTranslator.inline_call_ + # we hijack this function to know all the functions called + # during Dynamo tracing, and their corresponding files + inline_call = InliningInstructionTranslator.inline_call_ + + def patched_inline_call(self_): + code = self_.f_code + self.vllm_config.compilation_config.traced_files.add(code.co_filename) + return inline_call(self_) + + # Disable the C++ compilation of symbolic shape guards. C++-fication + # of symbolic shape guards can improve guard overhead. But, since + # vllm skip guards anyways, setting this flag to False can improve + # compile time. 
+ dynamo_config_patches = {} + try: + _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards + dynamo_config_patches["enable_cpp_symbolic_shape_guards"] = False + except AttributeError: + # Note: this config is not available in torch 2.6, we can skip + # if the config doesn't exist + logger.debug("enable_cpp_symbolic_shape_guards config not available") + + with ( + patch.object( + InliningInstructionTranslator, "inline_call_", patched_inline_call + ), + torch._dynamo.config.patch(**dynamo_config_patches), + maybe_use_cudagraph_partition_wrapper(self.vllm_config), + _torch27_patch_tensor_subclasses(), + ): + if envs.VLLM_USE_AOT_COMPILE: + self.aot_compiled_fn = self.aot_compile(*args, **kwargs) + output = self.aot_compiled_fn(self, *args, **kwargs) + assert aot_compilation_path is not None + assert cache_dir is not None + try: + os.makedirs(cache_dir, exist_ok=True) + self.aot_compiled_fn.save_compiled_function(aot_compilation_path) + except Exception as e: + logger.warning( + "Cannot save aot compilation to path %s, error: %s", + aot_compilation_path, + str(e), + ) + else: + output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) + + self.compiled = True + return output cls.__call__ = __call__ return cls diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 4d26619bd128..493e57f97f0f 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -4,11 +4,11 @@ import os import sys from abc import abstractmethod -from collections.abc import Callable from contextlib import contextmanager from types import CodeType import torch +import torch._C._dynamo.guards import vllm.envs as envs from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config @@ -17,88 +17,153 @@ logger = init_logger(__name__) -class TorchCompileWrapperWithCustomDispatcher: +def _noop_add_global_state_guard(self, *args, **kwargs): + """No-op to skip the GLOBAL_STATE guard entirely""" + pass + + +def _noop_add_torch_function_mode_stack_guard(self, *args, **kwargs): + """No-op to skip the TORCH_FUNCTION_MODE_STACK guard entirely""" + pass + + +@contextmanager +def _compilation_context(): + """Context manager for compilation settings and patches. + + This manager: + 1. Sets higher dynamo cache limits for compilation. (Needed for + qwen2_5_vl see test_qwen2_5_vl_evs_functionality). + Generally a recompilation can happen whenever we use a new + backend instance in torch.compile. + 2. Patches out add_global_state_guard to skip GLOBAL_STATE guards + 3. Patches out add_torch_function_mode_stack_guard to skip + TORCH_FUNCTION_MODE_STACK guards. + 4. Restores everything when compilation completes """ - A wrapper class for torch.compile, with a custom dispatch logic. - Subclasses should: - 1. Implement the forward method - 2. Implement the dispatch logic in the __call__ method - It can use `self.compiled_codes` to access the compiled bytecode, - and `with self.dispatch_to_code(index):` to dispatch to - the compiled code. - 3. Implement the `__init__` method to determine how to call - `torch.compile` over the forward method. 
+ # Save original values + original_global_state_guard = ( + torch._C._dynamo.guards.GuardManager.add_global_state_guard + ) + original_torch_function_mode_stack_guard = ( + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard + ) + original_cache_size = torch._dynamo.config.cache_size_limit + original_accumulated_cache = torch._dynamo.config.accumulated_cache_size_limit + + try: + # Set higher cache limits for compilation + torch._dynamo.config.cache_size_limit = 2048 + torch._dynamo.config.accumulated_cache_size_limit = 8192 + + # Patch guard manager + torch._C._dynamo.guards.GuardManager.add_global_state_guard = ( + _noop_add_global_state_guard + ) + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = ( + _noop_add_torch_function_mode_stack_guard + ) + yield + finally: + # Restore original values + torch._C._dynamo.guards.GuardManager.add_global_state_guard = ( + original_global_state_guard + ) + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = ( + original_torch_function_mode_stack_guard + ) + torch._dynamo.config.cache_size_limit = original_cache_size + torch._dynamo.config.accumulated_cache_size_limit = original_accumulated_cache + + +class TorchCompileWithNoGuardsWrapper: """ + A wrapper class for torch.compile, it ensures that all guards are dropped + when CompilationMode is not CompilationMode.STOCK_TORCH_COMPILE. + When guards are dropped, the first time __call__ is invoked, a single + compilation is triggered. Dynamo should never be traced again after that + since we drop all guards. + """ + + def __init__(self): + self.compiled = False - def __init__( - self, - compiled_callable: Callable | None = None, - compilation_mode: CompilationMode = CompilationMode.NONE, - ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config - if compiled_callable is None: - # default compilation settings - # compiling the forward method - - backend = vllm_config.compilation_config.init_backend(vllm_config) - options = None - if isinstance(backend, str) and backend == "inductor": - options = ( - get_current_vllm_config().compilation_config.inductor_compile_config - ) - if envs.VLLM_USE_AOT_COMPILE: - options = options or {} - # This effectively drop all the guards. - # We need this because bytecode hook is not used any more to - # drop guards in the AOT compile mode. - options["guard_filter_fn"] = lambda guards: [False for _ in guards] - if hasattr(torch._dynamo.config, "enable_aot_compile"): - torch._dynamo.config.enable_aot_compile = True - else: - msg = "torch._dynamo.config.enable_aot_compile is not " - msg += "available. AOT compile is disabled and please " - msg += "upgrade PyTorch version to use AOT compile." - logger.warning(msg) - - compiled_callable = torch.compile( - self.forward, fullgraph=True, backend=backend, options=options - ) - - self.compiled_callable = compiled_callable - self.original_code_object = self.__class__.forward.__code__ - self.compiled_codes: list[CodeType] = [] - torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) - - # read the env var to determine whether to use the custom dispatcher - # subclasses can use this to switch between the custom dispatcher - # and the default Dynamo guard mechanism. 
- self.use_custom_dispatcher: bool = ( - compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE + mode = vllm_config.compilation_config.mode + if mode is None: + raise RuntimeError("Compilation mode cannot be NO_COMPILATION") + + backend = vllm_config.compilation_config.init_backend(vllm_config) + options = {} + + if isinstance(backend, str) and backend == "inductor": + options = vllm_config.compilation_config.inductor_compile_config + + if mode != CompilationMode.STOCK_TORCH_COMPILE: + # Drop all the guards. + options["guard_filter_fn"] = lambda x: [False for _ in x] + + if envs.VLLM_USE_AOT_COMPILE: + if hasattr(torch._dynamo.config, "enable_aot_compile"): + torch._dynamo.config.enable_aot_compile = True + else: + msg = "torch._dynamo.config.enable_aot_compile is not " + msg += "available. AOT compile is disabled and please " + msg += "upgrade PyTorch version to use AOT compile." + logger.warning(msg) + + self._compiled_callable = torch.compile( + self.forward, + fullgraph=True, + dynamic=False, + backend=backend, + options=options, ) + if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE: + torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) + self._compiled_bytecode = None + def aot_compile(self, *args, **kwargs): - if not hasattr(self.compiled_callable, "aot_compile"): + if not hasattr(self._compiled_callable, "aot_compile"): raise RuntimeError( "aot_compile is not supported by the current configuration. " + "Please make sure torch.compile is enabled with the latest " + f"version of PyTorch (current using torch: {torch.__version__})" ) - return self.compiled_callable.aot_compile((args, kwargs)) + return self._compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile mode. - NOTE: this function can have additional arguments beyond the forward - method, for directly dispatching to the compiled code. - """ - return self.compiled_callable(*args, **kwargs) + if envs.VLLM_USE_BYTECODE_HOOK: + if ( + self.vllm_config.compilation_config.mode + == CompilationMode.STOCK_TORCH_COMPILE + ): + return self._compiled_callable(*args, **kwargs) + + if not self._compiled_bytecode: + # Make sure a compilation is triggered by clearing dynamo + # cache. + torch._dynamo.eval_frame.remove_from_cache(self.original_code_object()) + return self._compiled_callable(*args, **kwargs) + else: + with self._dispatch_to_compiled_code(): + return self.forward(*args, **kwargs) + else: + with _compilation_context(): + return self._compiled_callable(*args, **kwargs) @abstractmethod def forward(self, *args, **kwargs): ... 
+ def original_code_object(self) -> CodeType: + """Return the original code object of the forward method.""" + return self.__class__.forward.__code__ + def bytecode_hook(self, old_code: CodeType, new_code: CodeType): """Hook to save the compiled bytecode for direct execution.""" - if old_code is not self.original_code_object: + if old_code is not self.original_code_object(): return # code borrowed from https://github.com/thuml/depyf/blob/f4ad79fadee27ea113b4c75202db1eb1a11c0dbc/depyf/explain/enable_debugging.py#L25 frame = sys._getframe() @@ -114,7 +179,7 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): if frame.f_locals["self"] is not self: return - self.compiled_codes.append(new_code) + self._compiled_bytecode = new_code path = self.vllm_config.compile_debug_dump_path() if path: @@ -153,16 +218,21 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): raise RuntimeError(msg) @contextmanager - def dispatch_to_code(self, index: int): - """Context manager to dispatch to the compiled code. + def _dispatch_to_compiled_code(self): + # noqa: E501 + """ + Context manager to dispatch to internally compiled code for torch<2.8. Why does this work? Because Dynamo guarantees that the compiled bytecode has exactly the same arguments, cell variables, and free variables as the original code. Therefore we can directly switch the code object in the function and call it. - See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 - for more details. - """ - self.__class__.forward.__code__ = self.compiled_codes[index] - yield - self.__class__.forward.__code__ = self.original_code_object + See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details. + """ # noqa: E501 line too long + original = self.original_code_object() + assert self._compiled_bytecode is not None + self.__class__.forward.__code__ = self._compiled_bytecode + try: + yield + finally: + self.__class__.forward.__code__ = original diff --git a/vllm/envs.py b/vllm/envs.py index 0530938c32f9..7987e5fb83fd 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -92,6 +92,7 @@ VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False VLLM_USE_AOT_COMPILE: bool = False + VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False VLLM_TORCH_PROFILER_WITH_STACK: bool = True VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False @@ -556,6 +557,11 @@ def get_vllm_port() -> int | None: # compilation is done in warmup phase and the compilation will be # reused in subsequent calls. "VLLM_USE_AOT_COMPILE": use_aot_compile, + # Feature flag to enable/disable bytecode in + # TorchCompileWithNoGuardsWrapper. + "VLLM_USE_BYTECODE_HOOK": lambda: bool( + int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1")) + ), # Force vllm to always load AOT compiled models from disk. Failure # to load will result in a hard error when this is enabled. # Will be ignored when VLLM_USE_AOT_COMPILE is disabled. 
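For reference, the bytecode dispatch path above relies on a plain CPython property: a function's `__code__` attribute can be reassigned to any signature-compatible code object, which is exactly what `_dispatch_to_compiled_code` does with the bytecode captured by `bytecode_hook`. Below is a minimal standalone sketch of that mechanism, assuming nothing about vLLM or Dynamo internals; the function names are illustrative only.

# Minimal sketch of the __code__ swap behind _dispatch_to_compiled_code.
# Both functions have identical signatures and no free/cell variables, the
# same precondition Dynamo guarantees for its transformed bytecode.
def forward(x):
    return x * 2

def compiled_forward(x):
    return x * 100

original_code = forward.__code__
forward.__code__ = compiled_forward.__code__   # dispatch to the swapped-in bytecode
assert forward(3) == 300
forward.__code__ = original_code               # restore the original code object
assert forward(3) == 6

The wrapper applies the same swap to `cls.forward.__code__` and restores the original code object in a `finally` block, so an exception raised inside the compiled code cannot leave the class permanently patched.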
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 0f90578671db..01490e0dfac9 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -21,7 +21,7 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import MLAAttention from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper from vllm.config import ( ParallelConfig, VllmConfig, @@ -1895,12 +1895,14 @@ def reset_dynamo_cache(self): compiled_model = self.model.get_language_model().model else: compiled_model = self.model.model - if isinstance(compiled_model, TorchCompileWrapperWithCustomDispatcher): + if isinstance(compiled_model, TorchCompileWithNoGuardsWrapper): logger.info("Clear dynamo cache and cached dynamo bytecode.") torch._dynamo.eval_frame.remove_from_cache( - compiled_model.original_code_object + compiled_model.original_code_object() ) - compiled_model.compiled_codes.clear() + # Reset the wrapper to re-initialize. + compiled_model.compiled = False + TorchCompileWithNoGuardsWrapper.__init__(compiled_model) @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def select_hidden_states(self, hidden_states, indices_do_sample): From e5c78956c0c576d8f7230c29550ff09ffff0c064 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:13:46 -0500 Subject: [PATCH 078/578] [Bugfix] Fix incorrect use of hidden_states for shared_experts due to do_naive_dispatch_combine (#28740) Signed-off-by: Alexander Matveev --- vllm/model_executor/layers/fused_moe/layer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aed8245cbd83..023132acfed3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1749,14 +1749,16 @@ def forward_impl( with sp_ctx: if do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch( + hidden_states_combined, router_logits = get_ep_group().dispatch( hidden_states, router_logits, self.is_sequence_parallel ) # Matrix multiply. 
final_hidden_states = self.quant_method.apply( layer=self, - x=hidden_states, + x=hidden_states_combined + if do_naive_dispatch_combine + else hidden_states, router_logits=router_logits, top_k=self.top_k, renormalize=self.renormalize, From bf3ffb61e61525cce5fdec8a249f8114a0c0bfcc Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 14 Nov 2025 17:14:46 -0500 Subject: [PATCH 079/578] [Bugfix] Fix ChunkedLocalAttention CUDA Graph setting (#28739) Signed-off-by: Benjamin Chislett --- .../layers/chunked_local_attention.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py index f144e8435b6c..48fcc6fa736b 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/attention/layers/chunked_local_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -from typing import ClassVar import torch @@ -12,11 +11,16 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backends.utils import ( AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata, make_local_attention_virtual_batches, subclass_attention_backend, ) -from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, KVCacheSpec +from vllm.v1.kv_cache_interface import ( + AttentionSpec, + ChunkedLocalAttentionSpec, + KVCacheSpec, +) from ..layer import Attention @@ -30,9 +34,18 @@ def create_chunked_local_attention_backend( prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" underlying_builder = underlying_attn_backend.get_builder_cls() + assert issubclass(underlying_builder, AttentionMetadataBuilder) class ChunkedLocalAttentionBuilder(underlying_builder): # type: ignore - _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER + @classmethod + def get_cudagraph_support( + cls: type["AttentionMetadataBuilder"], + vllm_config: VllmConfig, + kv_cache_spec: AttentionSpec, + ) -> AttentionCGSupport: + # Explicit override in case the underlying builder specialized this getter. + # @override omitted only because of mypy limitation due to type variable. 
+ return AttentionCGSupport.NEVER def build( self, From e0c910bb89e45f4a2a976dc3c76248bbdea854e0 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 14 Nov 2025 23:55:42 +0100 Subject: [PATCH 080/578] [Hybrid] [Kernel] Fix chunk scan kernel when BLOCK_SIZE_DSTATE > 128 (#28295) Signed-off-by: Thomas Parnell --- vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index e5a5c9dd6f71..661c884627b0 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -245,7 +245,7 @@ def _chunk_scan_fwd_kernel( ) if not HAS_INITSTATES and (seq_idx != seq_idx_prev): prev_states = tl.zeros( - (BLOCK_SIZE_DSTATE, BLOCK_SIZE_K), dtype=C_ptr.dtype.element_ty + (BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty ) else: prev_states = tl.load( From ba041d980b5677a0ab6cebb3c7fe24cfe27bac66 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 14 Nov 2025 17:26:39 -0600 Subject: [PATCH 081/578] [Log] Save profiler results to file instead of stdout (#28144) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- vllm/v1/worker/gpu_worker.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2b9d8bb2f25e..283e3744bcf6 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -596,14 +596,19 @@ def profile(self, is_start: bool = True): self.profiler.start() else: self.profiler.stop() - # only print profiler results on rank 0 - if ( - isinstance(self.profiler, torch.profiler.profile) - and self.local_rank == 0 - ): - print( - self.profiler.key_averages().table(sort_by="self_cuda_time_total") - ) + if isinstance(self.profiler, torch.profiler.profile): + rank = self.local_rank + profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" + sort_key = "self_cuda_time_total" + table = self.profiler.key_averages().table(sort_by=sort_key) + + with open(profiler_out_file, "w") as f: + print(table, file=f) + + # only print profiler results on rank 0 + if rank == 0: + print(table) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1, uniform_decode=True) From 75f01b9d3c3a40e52e2fa4a2c9efc92cf45a88fc Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:53:21 -0500 Subject: [PATCH 082/578] [ROCm][CI/Build] Upgrade to ROCm 7.1 and AITER main (#28753) Signed-off-by: Gregory Shtrasberg --- docker/Dockerfile.rocm_base | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 19f7fa7e1468..df4f9b6c26e7 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG PYTORCH_BRANCH="1c57644d" @@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="9716b1b8" +ARG AITER_BRANCH="59bd8ff2" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM 
${BASE_IMAGE} AS base @@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV AITER_ROCM_ARCH=gfx942;gfx950 +# Required for RCCL in ROCm7.1 +ENV HSA_NO_SCRATCH_RECLAIM=1 + ARG PYTHON_VERSION=3.12 RUN mkdir -p /app From 58e61e56b744da109269586fe45ecc47b10dca5f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 14 Nov 2025 16:01:09 -0800 Subject: [PATCH 083/578] [Test] Rework e2e async scheduling tests (#28744) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 358 +++++++++++++++++++------- 1 file changed, 268 insertions(+), 90 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 444afd5196dd..dbe403ece051 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from itertools import repeat from typing import Any import pytest @@ -8,126 +9,291 @@ from vllm import SamplingParams from vllm.logprobs import Logprob from vllm.sampling_params import StructuredOutputsParams +from vllm.v1.metrics.reader import Metric from ...conftest import VllmRunner from ...models.utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" +MTP_MODEL = "XiaomiMiMo/MiMo-7B-Base" -@dynamo_config.patch(cache_size_limit=16) -def test_preempt_and_async_scheduling_e2e( - sample_json_schema, monkeypatch: pytest.MonkeyPatch -): - """Test consistency of combos of async scheduling, preemption, - uni/multiproc executor, and various sampling parameters - including structured outputs.""" +first_prompt = ( + "The following numbers of the sequence " + + ", ".join(str(i) for i in range(10)) + + " are:" +) +example_prompts = [first_prompt, "In one word, the capital of France is "] + [ + f"Tell me about the number {i}: " for i in range(32) +] - first_prompt = ( - "The following numbers of the sequence " - + ", ".join(str(i) for i in range(10)) - + " are:" - ) - example_prompts = [first_prompt, "In one word, the capital of France is "] + [ - f"Tell me about the number {i}: " for i in range(32) - ] +default_params = dict( + temperature=0.0, # greedy + max_tokens=20, +) - sampling_param_tests: list[dict[str, Any]] = [ + +def test_without_spec_decoding( + sample_json_schema, + monkeypatch: pytest.MonkeyPatch, +): + """Test consistency of combos of async scheduling, preemption, + uni/multiproc executor, prefill chunking.""" + struct_outputs = StructuredOutputsParams(json=sample_json_schema) + test_sampling_params: list[dict[str, Any]] = [ dict(), # dict(min_tokens=20), dict(presence_penalty=-1.0), dict(bad_words=["the", " the"]), dict(logprobs=2), dict(logprobs=2, presence_penalty=-1.0), - dict(structured_outputs=StructuredOutputsParams(json=sample_json_schema)), + dict(structured_outputs=struct_outputs), dict( - structured_outputs=StructuredOutputsParams(json=sample_json_schema), + structured_outputs=struct_outputs, logprobs=2, presence_penalty=-1.0, ), ] - default_params = dict( - temperature=0.0, # greedy - max_tokens=20, + # test_preemption, executor, async_scheduling, + # spec_config, test_prefill_chunking + test_configs = [ + (False, "mp", False, None, False), + (True, "mp", False, None, True), + (False, "mp", True, None, False), + (False, "uni", True, None, False), + (True, "mp", True, None, False), + (True, "uni", True, None, False), + (False, "mp", True, None, True), + # Async scheduling + preemption + 
chunked prefill needs to be fixed (WIP) + # (True, "mp", True, None, True), + # (True, "uni", True, None, True), + ] + + run_tests( + monkeypatch, + MODEL, + test_configs, + test_sampling_params, ) + +@pytest.mark.skip("MTP model too big to run in fp32 in CI") +def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): + """Test consistency and acceptance rates with some different combos of + preemption, executor, async scheduling, prefill chunking, + spec decoding model length. + """ + + spec_config = { + "method": "mtp", + "num_speculative_tokens": 2, + } + spec_config_short = spec_config | {"max_model_len": 50} + + # test_preemption, executor, async_scheduling, + # spec_config, test_prefill_chunking + test_configs = [ + (False, "mp", False, None, False), + (False, "mp", False, spec_config, False), + (True, "mp", False, spec_config, True), + (True, "uni", False, spec_config_short, True), + (False, "mp", True, spec_config, False), + (True, "mp", True, spec_config, False), + (False, "mp", True, spec_config_short, True), + (True, "uni", True, spec_config, False), + (True, "uni", True, spec_config_short, False), + # Async scheduling + preemption + chunked prefill needs to be fixed (WIP) + # (True, "mp", True, spec_config, True), + # (True, "uni", True, spec_config_short, True), + ] + + run_tests( + monkeypatch, + MTP_MODEL, + test_configs, + [{}], + ) + + +@dynamo_config.patch(cache_size_limit=16) +def run_tests( + monkeypatch: pytest.MonkeyPatch, + model: str, + test_configs: list[tuple], + test_sampling_params: list[dict[str, Any]], +): + """Test consistency of combos of async scheduling, preemption, + uni/multiproc executor with spec decoding.""" + with monkeypatch.context() as m: + # avoid precision errors m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") # m.setenv("VLLM_BATCH_INVARIANT", "1") + outputs: list[tuple[str, list, list]] = [] + for n, ( + test_preemption, + executor, + async_scheduling, + spec_config, + test_prefill_chunking, + ) in enumerate(test_configs, 1): + test_str = f"{n}/{len(test_configs)}" + test_results = run_test( + model, + test_str, + test_sampling_params, + test_preemption, + executor, + async_scheduling, + spec_config, + test_prefill_chunking=test_prefill_chunking, + ) + outputs.append(test_results) + + baseline_config, baseline_tests, _ = outputs[0] + _, _, baseline_acceptances = next( + (o for o in outputs if o[2] is not None), (None, None, None) + ) - outputs: list[tuple[str, list]] = [] - for test_preemption in [False, True]: - for executor in ["mp", "uni"]: - for async_scheduling in [False, True]: - cache_arg: dict[str, Any] = ( - dict(num_gpu_blocks_override=32) - if test_preemption - else dict(gpu_memory_utilization=0.7) - ) - test_config = ( - f"executor={executor}, preemption={test_preemption}," - f" async_sched={async_scheduling}" - ) - print("-" * 80) - print(f"---- TESTING: {test_config}") - print("-" * 80) - with VllmRunner( - MODEL, - max_model_len=512, - enforce_eager=True, - async_scheduling=async_scheduling, - distributed_executor_backend=executor, - dtype="float32", # avoid precision errors - **cache_arg, - ) as vllm_model: - results = [] - for override_params in sampling_param_tests: - print(f"----------- RUNNING PARAMS: {override_params}") - results.append( - vllm_model.generate( - example_prompts, - sampling_params=SamplingParams( - **default_params, **override_params - ), - return_logprobs=True, - ) - ) - - if not outputs: - # First check that the different parameter configs - # actually result in different output. 
- for (other_test_outs, other_test_logprobs), params in zip( - results[1:], sampling_param_tests[1:] - ): - with pytest.raises(AssertionError): - check_outputs_equal( - outputs_0_lst=results[0][0], - outputs_1_lst=other_test_outs, - name_0=f"baseline params={params}", - name_1=f"other params={params}", - ) - assert _all_logprobs_match( - results[0][1], other_test_logprobs - ) - - outputs.append((test_config, results)) - - baseline_config, baseline_tests = outputs[0] - - for test_config, test_outputs in outputs[1:]: - for (base_outs, base_logprobs), (test_outs, test_logprobs), params in zip( - baseline_tests, test_outputs, sampling_param_tests + print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}") + + failure = None + for test_config, test_outputs, test_acceptance_rates in outputs[1:]: + for (base_outs, base_logprobs), base_acceptance_rate, ( + test_outs, + test_logprobs, + ), test_acceptance_rate, params in zip( + baseline_tests, + baseline_acceptances or repeat(None), + test_outputs, + test_acceptance_rates or repeat(None), + test_sampling_params, ): - check_outputs_equal( - outputs_0_lst=base_outs, - outputs_1_lst=test_outs, - name_0=f"baseline=[{baseline_config}], params={params}", - name_1=f"config=[{test_config}], params={params}", + try: + check_outputs_equal( + outputs_0_lst=base_outs, + outputs_1_lst=test_outs, + name_0=f"baseline=[{baseline_config}], params={params}", + name_1=f"config=[{test_config}], params={params}", + ) + assert _all_logprobs_match(base_logprobs, test_logprobs) + + if ( + base_acceptance_rate is not None + and test_acceptance_rate is not None + ): + if "spec_mml=None" in test_config: + # because the acceptance rate can vary, we use a looser + # tolerance here. + assert ( + pytest.approx(test_acceptance_rate, rel=5e-2) + == base_acceptance_rate + ) + else: + # Currently the reported acceptance rate is expected to be + # lower when we skip drafting altogether. 
+ assert test_acceptance_rate > 0.05 + print( + f"PASSED: config=[{test_config}], params={params}" + f" accept_rate={test_acceptance_rate}" + ) + except AssertionError as e: + print( + f"FAILED: config=[{test_config}], params={params}" + f" accept_rate={test_acceptance_rate}" + ) + if failure is None: + failure = e + + if failure is not None: + raise failure + + +def run_test( + model: str, + test_str: str, + sampling_param_tests: list[dict[str, Any]], + test_preemption: bool, + executor: str, + async_scheduling: bool, + spec_config: dict[str, Any] | None, + test_prefill_chunking: bool, +): + spec_decoding = spec_config is not None + cache_arg: dict[str, Any] = ( + dict(num_gpu_blocks_override=32) + if test_preemption + else dict(gpu_memory_utilization=0.9) + ) + spec_mml = (spec_config or {}).get("max_model_len") + test_config = ( + f"executor={executor}, preemption={test_preemption}, " + f"async_sched={async_scheduling}, " + f"chunk_prefill={test_prefill_chunking}, " + f"spec_decoding={spec_decoding}, spec_mml={spec_mml}" + ) + print("-" * 80) + print(f"---- TESTING {test_str}: {test_config}") + print("-" * 80) + with VllmRunner( + model, + max_model_len=512, + enable_chunked_prefill=test_prefill_chunking, + max_num_batched_tokens=48 if test_prefill_chunking else None, + # enforce_eager=True, + async_scheduling=async_scheduling, + distributed_executor_backend=executor, + dtype="float32", # avoid precision errors + speculative_config=spec_config, + disable_log_stats=False, + **cache_arg, + ) as vllm_model: + results = [] + acceptance_rates: list[float] | None = [] if spec_decoding else None + for override_params in sampling_param_tests: + metrics_before = vllm_model.llm.get_metrics() + print(f"----------- RUNNING PARAMS: {override_params}") + results.append( + vllm_model.generate( + example_prompts, + sampling_params=SamplingParams( + **default_params, + **override_params, + ), + return_logprobs=True, + ) ) - assert _all_logprobs_match(base_logprobs, test_logprobs) + metrics_after = vllm_model.llm.get_metrics() + if acceptance_rates is not None: + acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after) + acceptance_rates.append(acceptance_rate) + print(f"ACCEPTANCE RATE {acceptance_rate}") + + if test_preemption: + preemptions = _get_count( + metrics_before, + metrics_after, + "vllm:num_preemptions", + ) + assert preemptions > 0, "preemption test had no preemptions" + + if len(results) > 1: + # First check that the different parameter configs + # actually result in different output. 
+ for (other_test_outs, other_test_logprobs), params in zip( + results[1:], sampling_param_tests[1:] + ): + with pytest.raises(AssertionError): + check_outputs_equal( + outputs_0_lst=results[0][0], + outputs_1_lst=other_test_outs, + name_0=f"baseline params={params}", + name_1=f"other params={params}", + ) + assert _all_logprobs_match(results[0][1], other_test_logprobs) - print(f"PASSED: config=[{test_config}], params={params}") + return test_config, results, acceptance_rates def _all_logprobs_match(req_a, req_b) -> bool: @@ -149,3 +315,15 @@ def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> boo and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6) for a, b in ((lps_a[x], lps_b[x]) for x in lps_a) ) + + +def _get_acceptance_rate(before: list[Metric], after: list[Metric]) -> float: + draft = _get_count(before, after, "vllm:spec_decode_num_draft_tokens") + accept = _get_count(before, after, "vllm:spec_decode_num_accepted_tokens") + return accept / draft if draft > 0 else 0.0 + + +def _get_count(before: list[Metric], after: list[Metric], name: str) -> int: + before_val = next(m.value for m in before if m.name == name) + after_val = next(m.value for m in after if m.name == name) + return after_val - before_val From 186352b2703652141df75bc2c012a784706e8572 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Fri, 14 Nov 2025 16:04:04 -0800 Subject: [PATCH 084/578] [Core] Performance: Use list[np.ndarray] instead of list[list[int]] for output tokens for GC optimization (#26368) Signed-off-by: Jialin Ouyang --- tests/v1/core/test_async_scheduler.py | 3 +- tests/v1/core/test_scheduler.py | 76 +++++++++++++++----------- tests/v1/kv_connector/unit/utils.py | 3 +- tests/v1/spec_decode/test_eagle.py | 5 +- tests/v1/spec_decode/test_ngram.py | 18 +++--- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/outputs.py | 2 +- vllm/v1/sample/rejection_sampler.py | 8 +-- vllm/v1/spec_decode/eagle.py | 7 +-- vllm/v1/spec_decode/ngram_proposer.py | 6 +- vllm/v1/spec_decode/suffix_decoding.py | 10 ++-- vllm/v1/worker/gpu_model_runner.py | 36 +++++++----- 12 files changed, 102 insertions(+), 76 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index e0645ed43015..1d80ee987591 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque +import numpy as np import pytest from vllm.v1.core.sched.output import SchedulerOutput @@ -21,7 +22,7 @@ def _make_model_runner_output( return ModelRunnerOutput( req_ids=req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)}, - sampled_token_ids=[[i] for i in range(len(req_ids))], + sampled_token_ids=[np.array([i]) for i in range(len(req_ids))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 04e738293cd7..6d95c29ec1ab 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3,6 +3,7 @@ import dataclasses from unittest.mock import Mock +import numpy as np import pytest import torch @@ -169,7 +170,7 @@ def test_schedule_partial_requests(): req_id_to_index=req_to_index, # Only the first request has a sampled token id because # the rest requests are still being prefilled. 
- sampled_token_ids=[[0], [], []], + sampled_token_ids=[np.array([0]), np.array([]), np.array([])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -216,7 +217,7 @@ def test_no_mm_input_chunking(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -276,7 +277,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -300,7 +301,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + sampled_token_ids=[np.array([0]), np.array([0])] + + [np.array([]) for _ in range(len(requests) - 2)], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -347,8 +349,8 @@ def test_stop_via_update_from_output(): req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, sampled_token_ids=[ - [EOS_TOKEN_ID], - [10, 11], + np.array([EOS_TOKEN_ID]), + np.array([10, 11]), ], # First request hits EOS, second continues logprobs=None, prompt_logprobs_dict={}, @@ -392,7 +394,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token + sampled_token_ids=[ + np.array([10, 42, 12]), + np.array([13, 14]), + ], # First request hits stop token logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -436,7 +441,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens + sampled_token_ids=[ + np.array([10, 11, 12]), + np.array([13]), + ], # First request exceeds max_tokens logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -475,7 +483,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -616,7 +624,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -633,7 +641,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, 
prompt_logprobs_dict={}, pooler_output=[], @@ -670,7 +678,7 @@ def test_preempt_during_execution(): model_runner_output0 = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -687,7 +695,7 @@ def test_preempt_during_execution(): model_runner_output1 = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[42]], + sampled_token_ids=[np.array([42])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -704,14 +712,18 @@ def test_preempt_during_execution(): @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match - ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch - ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (2, 3, 3, [2, 1])), # multiple sequences - ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence - ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])), # early mismatch + ( + [[1, 2], [3]], + [np.array([1, 2, 5]), np.array([3, 4])], + (2, 3, 3, [2, 1]), + ), # multiple sequences + ([[1]], [np.array([1, 2])], (1, 1, 1, [1])), # single token sequence + ([[]], [np.array([5])], (0, 0, 0, [0])), # empty sequence ( [[1, 2, 3], [4, 5, 6]], - [[1, 2, 7], [4, 8]], + [np.array([1, 2, 7]), np.array([4, 8])], (2, 6, 3, [2, 1, 0]), ), # multiple mismatches ], @@ -745,7 +757,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], + sampled_token_ids=[np.array([0]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -972,7 +984,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1025,7 +1037,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1088,7 +1100,7 @@ def test_external_prefix_cache_metrics(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[r.request_id for r in requests], req_id_to_index={r.request_id: i for i, r in enumerate(requests)}, - sampled_token_ids=[[1000]] * NUM_REQUESTS, + sampled_token_ids=[np.array([1000])] * NUM_REQUESTS, logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1154,7 +1166,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1239,7 +1251,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * 
len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1332,7 +1344,7 @@ def make_output(scheduler: Scheduler): return ModelRunnerOutput( req_ids=[req.request_id for req in scheduler.running], req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)}, - sampled_token_ids=[[1000]] * len(scheduler.running), + sampled_token_ids=[np.array([1000])] * len(scheduler.running), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1749,7 +1761,7 @@ def test_priority_scheduling_preemption(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1818,7 +1830,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2064,7 +2076,7 @@ def test_priority_scheduling_heap_property(): model_output = ModelRunnerOutput( req_ids=[req.req_id], req_id_to_index={req.req_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2150,7 +2162,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2181,7 +2193,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100] for _ in requests], + sampled_token_ids=[np.array([100]) for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2207,7 +2219,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[], [100]], + sampled_token_ids=[np.array([]), np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f35f91bb3adf..c248104d5b5e 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -7,6 +7,7 @@ from itertools import chain, count from typing import Any +import numpy as np import torch from vllm import SamplingParams @@ -228,7 +229,7 @@ def create_model_runner_output( # Make sampled tokens. 
sampled_token = EOS_TOKEN_ID if use_eos else token_id - sampled_token_ids = [[sampled_token] for _ in req_ids] + sampled_token_ids = [np.array([sampled_token]) for _ in req_ids] kv_connector_output = ( None diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 89d0ec769ac0..421da5241555 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -3,6 +3,7 @@ from unittest import mock +import numpy as np import pytest import torch @@ -112,7 +113,9 @@ def test_prepare_next_token_ids(): sampled_token_ids_tensor = torch.tensor( sampled_token_ids, dtype=torch.int32, device=device ) - sampled_token_ids_cpu = [[i for i in seq if i != -1] for seq in sampled_token_ids] + sampled_token_ids_cpu = [ + np.array([i for i in seq if i != -1]) for seq in sampled_token_ids + ] expected_next_token_ids_cpu = [1, 4, 30, 40] expected_next_token_ids_tensor = torch.tensor( diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 692c39282c37..563bc1d957f4 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -77,7 +77,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match. token_ids_cpu = np.array([[1, 2, 3, 4, 5]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -88,7 +88,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -99,7 +99,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram but match for 3-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -111,7 +111,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # In this case, the proposer should return the 4-gram match. token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -122,7 +122,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Match for 2-gram and 3-gram, but not 4-gram. token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -133,7 +133,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Multiple 3-gram matched, but always pick the first one. 
token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -144,7 +144,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # check empty input token_ids_cpu = np.array([[]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -157,7 +157,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # second request has 3 tokens and no match. Padded with -1 for max len 5 token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([5, 3]), token_ids_cpu=token_ids_cpu, @@ -181,7 +181,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: input_2[:3] = [4, 5, 6] token_ids_cpu = np.array([input_1, input_2]) result = ngram_proposer.propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([len(input_1), 3]), token_ids_cpu=token_ids_cpu, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ba7ad0c09173..c640c40a455d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1010,8 +1010,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids = ( - sampled_token_ids[req_index] if sampled_token_ids else [] + generated_token_ids: list[int] = ( + sampled_token_ids[req_index].tolist() if sampled_token_ids else [] ) scheduled_spec_token_ids = ( diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index e32d5bb608b1..60ee9671e497 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -158,7 +158,7 @@ class ModelRunnerOutput: # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. - sampled_token_ids: list[list[int]] + sampled_token_ids: list[np.ndarray] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 926305d25f56..f31a0cddda9a 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -3,6 +3,7 @@ from dataclasses import replace +import numpy as np import torch import torch.nn as nn @@ -204,7 +205,7 @@ def _get_logprobs_tensors( def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[list[int]]: + ) -> list[np.ndarray]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -220,10 +221,7 @@ def parse_output( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) - outputs = [ - row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) - ] - return outputs + return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)] def apply_logits_processors( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index beef5203e039..f3b34544f8d9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -484,7 +484,7 @@ def propose( def prepare_next_token_ids_cpu( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, num_scheduled_tokens: dict[str, int], @@ -499,7 +499,7 @@ def prepare_next_token_ids_cpu( req_ids = gpu_input_batch.req_ids next_token_ids: list[int] = [] for i, token_ids in enumerate(sampled_token_ids): - if token_ids: + if token_ids.shape[0] > 0: # Common case. next_token_id = token_ids[-1] else: @@ -510,10 +510,9 @@ def prepare_next_token_ids_cpu( seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id] next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - next_token_ids = torch.tensor( + return torch.tensor( next_token_ids, dtype=torch.int32, device=self.input_ids.device ) - return next_token_ids def prepare_next_token_ids_padded( self, diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index e2f83cb24aa9..378937dba988 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -54,7 +54,7 @@ def __init__(self, vllm_config: VllmConfig): # Trigger Numba JIT compilation for N-gram proposer. # This usually takes less than 1 second. self.propose( - [[]] * 1024, + [np.array([])] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32), np.zeros((1024, self.max_model_len), dtype=np.int32), @@ -131,7 +131,7 @@ def batch_propose( def propose( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], req_ids: list[str], num_tokens_no_spec: np.ndarray, token_ids_cpu: np.ndarray, @@ -140,7 +140,7 @@ def propose( # find which requests need ngram proposals valid_ngram_requests = [] for i, sampled_ids in enumerate(sampled_token_ids): - num_sampled_ids = len(sampled_ids) + num_sampled_ids = sampled_ids.shape[0] if not num_sampled_ids: # Skip speculative decoding. continue diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py index 049e335db325..d76e0ffe778d 100644 --- a/vllm/v1/spec_decode/suffix_decoding.py +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import numpy as np + from vllm.config import VllmConfig from vllm.v1.worker.gpu_input_batch import InputBatch @@ -32,16 +34,16 @@ def __init__(self, vllm_config: VllmConfig): def propose( self, input_batch: InputBatch, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], ) -> list[list[int]]: """ Propose speculative tokens for each request in the input batch. Suffix Decoding will speculate a dynamic number of tokens for each request every decoding step, so each entry in the returned list may have different lengths. 
""" - draft_token_ids: list[list[int]] = [] + draft_token_ids: list[np.ndarray] = [] for i, sampled_ids in enumerate(sampled_token_ids): - if not sampled_ids: + if sampled_ids.shape[0] == 0: # Skip speculative decoding for partial prefills. draft_token_ids.append([]) continue @@ -70,7 +72,7 @@ def propose( self.suffix_cache.start_request(req_id, prompt_token_ids) # Append the newly sampled ids to the suffix cache for this request. - self.suffix_cache.add_active_response(req_id, sampled_ids) + self.suffix_cache.add_active_response(req_id, sampled_ids.tolist()) # Suffix decoding only uses the most recent tokens up to max_tree_depth, so # we extract the pattern from the end of the input. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9b3e5b668aab..d0d6164180e6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -216,9 +216,11 @@ def get_output(self) -> ModelRunnerOutput: del self._logprobs_tensors del self._sampled_token_ids - valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in self.sampled_token_ids_cpu.numpy() + ] for i in self._invalid_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[i] = np.array([]) output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids @@ -2339,7 +2341,7 @@ def _bookkeeping_sync( ) -> tuple[ dict[str, int], LogprobsLists | None, - list[list[int]], + list[np.ndarray], dict[str, LogprobsTensors | None], list[str], dict[str, int], @@ -2365,6 +2367,7 @@ def _bookkeeping_sync( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids invalid_req_indices = [] + valid_sampled_token_ids: list[np.ndarray] if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2379,7 +2382,7 @@ def _bookkeeping_sync( ) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)].clear() + valid_sampled_token_ids[int(i)] = np.array([]) else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2407,19 +2410,24 @@ def _bookkeeping_sync( [0] if spec_decode_metadata and logprobs_tensors else None ) for req_idx in range(num_sampled_tokens): + sampled_ids: np.ndarray | None if self.use_async_scheduling: - sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None + sampled_ids = ( + np.array([-1]) if req_idx not in invalid_req_indices_set else None + ) else: sampled_ids = valid_sampled_token_ids[req_idx] - num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 + num_sampled_ids: int = ( + sampled_ids.shape[0] if sampled_ids is not None else 0 + ) if cu_num_accepted_tokens is not None: cu_num_accepted_tokens.append( cu_num_accepted_tokens[-1] + num_sampled_ids ) - if not sampled_ids: + if sampled_ids is None or num_sampled_ids == 0: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] @@ -2761,7 +2769,9 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) - def propose_draft_token_ids(sampled_token_ids): + def propose_draft_token_ids( + sampled_token_ids: torch.Tensor | list[np.ndarray], + ) -> None: assert spec_decode_common_attn_metadata is not None with record_function_or_nullcontext("gpu_model_runner: draft"): self._draft_token_ids = self.propose_draft_token_ids( @@ -2883,14 +2893,14 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: torch.Tensor | list[list[int]], + sampled_token_ids: torch.Tensor | list[np.ndarray], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, aux_hidden_states: list[torch.Tensor] | None, spec_decode_metadata: SpecDecodeMetadata | None, common_attn_metadata: CommonAttentionMetadata, - ) -> list[list[int]] | torch.Tensor: + ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": assert isinstance(sampled_token_ids, list) @@ -2922,7 +2932,7 @@ def propose_draft_token_ids( for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids ): - indices.append(offset + len(tokens) - 1) + indices.append(offset + tokens.shape[0] - 1) offset += num_draft + 1 indices = torch.tensor(indices, device=self.device) hidden_states = sample_hidden_states[indices] @@ -4862,7 +4872,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: # This is a short term mitigation for issue mentioned in # https://github.com/vllm-project/vllm/issues/22754. 
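A rough sketch of the pinned-buffer copy pattern used by the `_to_list` hunk continuing below; the tensor name and shape are invented, and the point is only that the recorded event waits for this one copy rather than forcing a device-wide stream sync.

import torch

if torch.cuda.is_available():
    sampled_gpu = torch.randint(0, 1000, (8, 4), dtype=torch.int64, device="cuda")
    # Stage into pinned host memory so the device-to-host copy can run async.
    pinned = torch.empty(sampled_gpu.shape, dtype=sampled_gpu.dtype,
                         device="cpu", pin_memory=True)
    pinned.copy_(sampled_gpu, non_blocking=True)
    done = torch.cuda.Event()
    done.record()
    done.synchronize()                      # wait for this copy only
    rows = [row for row in pinned.numpy()]  # or pinned.tolist() for plain lists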
# `tolist` would trigger a cuda wise stream sync, which @@ -4875,4 +4885,4 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: pinned.copy_(sampled_token_ids, non_blocking=True) self.transfer_event.record() self.transfer_event.synchronize() - return pinned.tolist() + return [row for row in pinned.numpy()] From 9fc81ec765aa0daa6f704023c0f902a0da653b72 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Fri, 14 Nov 2025 16:58:32 -0800 Subject: [PATCH 085/578] [TPU] Fix import error in tpu launch (#28758) Signed-off-by: Qiliang Cui --- vllm/platforms/tpu.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 4ab037fdb77e..c1218801bc07 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -9,20 +9,25 @@ from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams, SamplingType from .interface import Platform, PlatformEnum if TYPE_CHECKING: + from typing import TypeAlias + from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import BlockSize from vllm.pooling_params import PoolingParams + from vllm.sampling_params import SamplingParams + + ParamsType: TypeAlias = SamplingParams | PoolingParams else: BlockSize = None VllmConfig = None PoolingParams = None AttentionBackendEnum = None + ParamsType = None logger = init_logger(__name__) @@ -203,10 +208,12 @@ def get_device_communicator_cls(cls) -> str: def validate_request( cls, prompt: PromptType, - params: SamplingParams | PoolingParams, + params: ParamsType, processed_inputs: ProcessorInputs, ) -> None: """Raises if this request is unsupported on this platform""" + from vllm.sampling_params import SamplingParams, SamplingType + if ( isinstance(params, SamplingParams) and params.sampling_type == SamplingType.RANDOM_SEED From f05d474c8a08659cc1610a85de7e7a7095494a52 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 15 Nov 2025 03:45:11 +0000 Subject: [PATCH 086/578] [Model][Qwen3VL] Use `mm_position` to compute mrope positions (#28730) Signed-off-by: Lukas Geiger Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen3_vl.py | 87 +++++++++----------------- 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f1c020ab5813..fa6b71bf9268 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -24,7 +24,7 @@ # limitations under the License. 
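The Qwen3VL hunks below rebuild the mrope positions from each feature's `mm_position` offset; as a toy illustration of the 3-row (t, h, w) layout being produced, with made-up sizes:

import numpy as np

text_len, grid_h, grid_w = 4, 2, 3   # invented: 4 text tokens, then one 2x3 vision grid
llm_pos_ids_list = []
st_idx = 0
# Text span: all three rows advance together.
llm_pos_ids_list.append(np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx)
# Vision grid: rows enumerate (t, h, w) indices, shifted past the text span.
st_idx = llm_pos_ids_list[-1].max() + 1
grid_indices = np.indices((1, grid_h, grid_w)).reshape(3, -1)
llm_pos_ids_list.append(grid_indices + st_idx)
positions = np.concatenate(llm_pos_ids_list, axis=1)
print(positions.shape)   # (3, 10)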
"""Inference-only Qwen3VL model compatible with HuggingFace weights.""" -from collections.abc import Callable, Iterable, Mapping, Sequence +from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from functools import partial from itertools import islice from typing import Any @@ -1412,72 +1412,47 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: ) return mm_input_by_modality + def iter_mm_grid_hw( + self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec] + ) -> Iterator[tuple[int, int, int]]: + video_token_id = self.config.video_token_id + spatial_merge_size = self.config.vision_config.spatial_merge_size + for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset): + offset = mm_feature.mm_position.offset + if mm_feature.modality == "image": + t, h, w = mm_feature.data["image_grid_thw"].data.tolist() + assert t == 1, f"Image must have 1 frame, got {t}" + yield offset, h // spatial_merge_size, w // spatial_merge_size + elif mm_feature.modality == "video": + t, h, w = mm_feature.data["video_grid_thw"].data.tolist() + llm_grid_h = h // spatial_merge_size + llm_grid_w = w // spatial_merge_size + for _ in range(t): + offset = input_tokens.index(video_token_id, offset) + yield offset, llm_grid_h, llm_grid_w + offset += llm_grid_h * llm_grid_w + else: + raise ValueError(f"Unsupported modality: {mm_feature.modality}") + def get_mrope_input_positions( self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec], ) -> tuple[torch.Tensor, int]: - kwargs = MultiModalFeatureSpec.gather_kwargs( - mm_features, - {"image_grid_thw", "video_grid_thw"}, - ) - image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])] - video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])] - - video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)] - - hf_config = self.config - image_token_id = hf_config.image_token_id - video_token_id = hf_config.video_token_id - vision_start_token_id = hf_config.vision_start_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - - input_tokens_array = np.array(input_tokens) - vision_start_mask = input_tokens_array == vision_start_token_id - vision_tokens = input_tokens_array[vision_start_mask.nonzero()[0] + 1] - image_nums = np.count_nonzero(vision_tokens == image_token_id) - video_nums = np.count_nonzero(vision_tokens == video_token_id) - llm_pos_ids_list: list = [] - + llm_pos_ids_list = [] st = 0 - remain_images, remain_videos = image_nums, video_nums - - image_index, video_index = 0, 0 - for _ in range(image_nums + video_nums): - if image_token_id in input_tokens and remain_images > 0: - ed_image = input_tokens.index(image_token_id, st) - else: - ed_image = len(input_tokens) + 1 - if video_token_id in input_tokens and remain_videos > 0: - ed_video = input_tokens.index(video_token_id, st) - else: - ed_video = len(input_tokens) + 1 - if ed_image < ed_video: - t, h, w = image_grid_thw[image_index] - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = video_grid_thw[video_index] - video_index += 1 - remain_videos -= 1 - ed = ed_video - - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_merge_size, - w // spatial_merge_size, - ) - text_len = ed - st - + for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw( + input_tokens, mm_features + ): + text_len = offset - st st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 llm_pos_ids_list.append( 
np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx ) - grid_indices = np.indices((llm_grid_t, llm_grid_h, llm_grid_w)) - llm_pos_ids_list.append(grid_indices.reshape(3, -1) + text_len + st_idx) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w + grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1) + llm_pos_ids_list.append(grid_indices + text_len + st_idx) + st = offset + llm_grid_h * llm_grid_w if st < len(input_tokens): st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 From edfe49818959b1a1a0b7e8ef7ffcdc39d9903ec6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 22:51:05 -0500 Subject: [PATCH 087/578] [Bugfix] Build hadacore kernels on >SM90 (#28748) Signed-off-by: mgoin --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dcc44be87e55..3a37040edbf1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # Hadacore kernels - cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if(HADACORE_ARCHS) set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") set_gencode_flags_for_srcs( From ac86bff8cb53939117a6a460af1a6c3fea829a56 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 14 Nov 2025 20:24:00 -0800 Subject: [PATCH 088/578] =?UTF-8?q?Revert=20"[Core]=20Performance:=20Use?= =?UTF-8?q?=20list[np.ndarray]=20instead=20of=20list[list=E2=80=A6=20(#287?= =?UTF-8?q?73)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/v1/core/test_async_scheduler.py | 3 +- tests/v1/core/test_scheduler.py | 76 +++++++++++--------------- tests/v1/kv_connector/unit/utils.py | 3 +- tests/v1/spec_decode/test_eagle.py | 5 +- tests/v1/spec_decode/test_ngram.py | 18 +++--- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/outputs.py | 2 +- vllm/v1/sample/rejection_sampler.py | 8 ++- vllm/v1/spec_decode/eagle.py | 7 ++- vllm/v1/spec_decode/ngram_proposer.py | 6 +- vllm/v1/spec_decode/suffix_decoding.py | 10 ++-- vllm/v1/worker/gpu_model_runner.py | 36 +++++------- 12 files changed, 76 insertions(+), 102 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index 1d80ee987591..e0645ed43015 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque -import numpy as np import pytest from vllm.v1.core.sched.output import SchedulerOutput @@ -22,7 +21,7 @@ def _make_model_runner_output( return ModelRunnerOutput( req_ids=req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)}, - sampled_token_ids=[np.array([i]) for i in range(len(req_ids))], + sampled_token_ids=[[i] for i in range(len(req_ids))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 6d95c29ec1ab..04e738293cd7 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3,7 +3,6 @@ import dataclasses from unittest.mock import Mock -import numpy as np import pytest import torch @@ -170,7 +169,7 @@ def test_schedule_partial_requests(): req_id_to_index=req_to_index, # Only the first request has a sampled token id because # the rest requests are still being prefilled. 
- sampled_token_ids=[np.array([0]), np.array([]), np.array([])], + sampled_token_ids=[[0], [], []], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -217,7 +216,7 @@ def test_no_mm_input_chunking(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[np.array([]) for _ in range(len(requests))], + sampled_token_ids=[[] for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -277,7 +276,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[np.array([]) for _ in range(len(requests))], + sampled_token_ids=[[] for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -301,8 +300,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[np.array([0]), np.array([0])] - + [np.array([]) for _ in range(len(requests) - 2)], + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -349,8 +347,8 @@ def test_stop_via_update_from_output(): req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, sampled_token_ids=[ - np.array([EOS_TOKEN_ID]), - np.array([10, 11]), + [EOS_TOKEN_ID], + [10, 11], ], # First request hits EOS, second continues logprobs=None, prompt_logprobs_dict={}, @@ -394,10 +392,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[ - np.array([10, 42, 12]), - np.array([13, 14]), - ], # First request hits stop token + sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -441,10 +436,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[ - np.array([10, 11, 12]), - np.array([13]), - ], # First request exceeds max_tokens + sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -483,7 +475,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])], + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -624,7 +616,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -641,7 +633,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], logprobs=None, 
prompt_logprobs_dict={}, pooler_output=[], @@ -678,7 +670,7 @@ def test_preempt_during_execution(): model_runner_output0 = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -695,7 +687,7 @@ def test_preempt_during_execution(): model_runner_output1 = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[np.array([42])], + sampled_token_ids=[[42]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -712,18 +704,14 @@ def test_preempt_during_execution(): @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])), # perfect match - ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])), # early mismatch - ( - [[1, 2], [3]], - [np.array([1, 2, 5]), np.array([3, 4])], - (2, 3, 3, [2, 1]), - ), # multiple sequences - ([[1]], [np.array([1, 2])], (1, 1, 1, [1])), # single token sequence - ([[]], [np.array([5])], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence ( [[1, 2, 3], [4, 5, 6]], - [np.array([1, 2, 7]), np.array([4, 8])], + [[1, 2, 7], [4, 8]], (2, 6, 3, [2, 1, 0]), ), # multiple mismatches ], @@ -757,7 +745,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([0]) for _ in range(len(requests))], + sampled_token_ids=[[0] for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -984,7 +972,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1037,7 +1025,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1100,7 +1088,7 @@ def test_external_prefix_cache_metrics(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[r.request_id for r in requests], req_id_to_index={r.request_id: i for i, r in enumerate(requests)}, - sampled_token_ids=[np.array([1000])] * NUM_REQUESTS, + sampled_token_ids=[[1000]] * NUM_REQUESTS, logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1166,7 +1154,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1251,7 +1239,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - 
sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1344,7 +1332,7 @@ def make_output(scheduler: Scheduler): return ModelRunnerOutput( req_ids=[req.request_id for req in scheduler.running], req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)}, - sampled_token_ids=[np.array([1000])] * len(scheduler.running), + sampled_token_ids=[[1000]] * len(scheduler.running), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1761,7 +1749,7 @@ def test_priority_scheduling_preemption(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[np.array([100]) for _ in low_priority_requests], + sampled_token_ids=[[100] for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1830,7 +1818,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[np.array([100]) for _ in low_priority_requests], + sampled_token_ids=[[100] for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2076,7 +2064,7 @@ def test_priority_scheduling_heap_property(): model_output = ModelRunnerOutput( req_ids=[req.req_id], req_id_to_index={req.req_id: 0}, - sampled_token_ids=[np.array([100])], + sampled_token_ids=[[100]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2162,7 +2150,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[np.array([100])], + sampled_token_ids=[[100]], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2193,7 +2181,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[np.array([100]) for _ in requests], + sampled_token_ids=[[100] for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2219,7 +2207,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[np.array([]), np.array([100])], + sampled_token_ids=[[], [100]], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index c248104d5b5e..f35f91bb3adf 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -7,7 +7,6 @@ from itertools import chain, count from typing import Any -import numpy as np import torch from vllm import SamplingParams @@ -229,7 +228,7 @@ def create_model_runner_output( # Make sampled tokens. 
sampled_token = EOS_TOKEN_ID if use_eos else token_id - sampled_token_ids = [np.array([sampled_token]) for _ in req_ids] + sampled_token_ids = [[sampled_token] for _ in req_ids] kv_connector_output = ( None diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 421da5241555..89d0ec769ac0 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -3,7 +3,6 @@ from unittest import mock -import numpy as np import pytest import torch @@ -113,9 +112,7 @@ def test_prepare_next_token_ids(): sampled_token_ids_tensor = torch.tensor( sampled_token_ids, dtype=torch.int32, device=device ) - sampled_token_ids_cpu = [ - np.array([i for i in seq if i != -1]) for seq in sampled_token_ids - ] + sampled_token_ids_cpu = [[i for i in seq if i != -1] for seq in sampled_token_ids] expected_next_token_ids_cpu = [1, 4, 30, 40] expected_next_token_ids_tensor = torch.tensor( diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 563bc1d957f4..692c39282c37 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -77,7 +77,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match. token_ids_cpu = np.array([[1, 2, 3, 4, 5]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -88,7 +88,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -99,7 +99,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram but match for 3-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -111,7 +111,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # In this case, the proposer should return the 4-gram match. token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -122,7 +122,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Match for 2-gram and 3-gram, but not 4-gram. token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -133,7 +133,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Multiple 3-gram matched, but always pick the first one. 
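These ngram tests describe the proposal rule: try the longest suffix n-gram first, take the first earlier match, and propose the k tokens that followed it. A simplified stand-alone sketch of that rule (not the Numba implementation used by vLLM):

import numpy as np

def propose_ngram(history: np.ndarray, min_n: int, max_n: int, k: int) -> np.ndarray:
    for n in range(max_n, min_n - 1, -1):          # longest n-gram first
        if len(history) < n + 1:
            continue
        suffix = history[-n:]
        for start in range(len(history) - n):      # first (leftmost) match wins
            if np.array_equal(history[start:start + n], suffix):
                return history[start + n:start + n + k]
    return np.array([], dtype=history.dtype)

print(propose_ngram(np.array([1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3]), 3, 3, 2))
# -> [100   1]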
token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -144,7 +144,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # check empty input token_ids_cpu = np.array([[]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -157,7 +157,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # second request has 3 tokens and no match. Padded with -1 for max len 5 token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[np.array([0]), np.array([1])], + sampled_token_ids=[[0], [1]], req_ids=["0", "1"], num_tokens_no_spec=np.array([5, 3]), token_ids_cpu=token_ids_cpu, @@ -181,7 +181,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: input_2[:3] = [4, 5, 6] token_ids_cpu = np.array([input_1, input_2]) result = ngram_proposer.propose( - sampled_token_ids=[np.array([0]), np.array([1])], + sampled_token_ids=[[0], [1]], req_ids=["0", "1"], num_tokens_no_spec=np.array([len(input_1), 3]), token_ids_cpu=token_ids_cpu, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c640c40a455d..ba7ad0c09173 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1010,8 +1010,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids: list[int] = ( - sampled_token_ids[req_index].tolist() if sampled_token_ids else [] + generated_token_ids = ( + sampled_token_ids[req_index] if sampled_token_ids else [] ) scheduled_spec_token_ids = ( diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 60ee9671e497..e32d5bb608b1 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -158,7 +158,7 @@ class ModelRunnerOutput: # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. - sampled_token_ids: list[np.ndarray] + sampled_token_ids: list[list[int]] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index f31a0cddda9a..926305d25f56 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -3,7 +3,6 @@ from dataclasses import replace -import numpy as np import torch import torch.nn as nn @@ -205,7 +204,7 @@ def _get_logprobs_tensors( def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[np.ndarray]: + ) -> list[list[int]]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -221,7 +220,10 @@ def parse_output( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) - return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)] + outputs = [ + row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) + ] + return outputs def apply_logits_processors( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f3b34544f8d9..beef5203e039 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -484,7 +484,7 @@ def propose( def prepare_next_token_ids_cpu( self, - sampled_token_ids: list[np.ndarray], + sampled_token_ids: list[list[int]], requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, num_scheduled_tokens: dict[str, int], @@ -499,7 +499,7 @@ def prepare_next_token_ids_cpu( req_ids = gpu_input_batch.req_ids next_token_ids: list[int] = [] for i, token_ids in enumerate(sampled_token_ids): - if token_ids.shape[0] > 0: + if token_ids: # Common case. next_token_id = token_ids[-1] else: @@ -510,9 +510,10 @@ def prepare_next_token_ids_cpu( seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id] next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - return torch.tensor( + next_token_ids = torch.tensor( next_token_ids, dtype=torch.int32, device=self.input_ids.device ) + return next_token_ids def prepare_next_token_ids_padded( self, diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index 378937dba988..e2f83cb24aa9 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -54,7 +54,7 @@ def __init__(self, vllm_config: VllmConfig): # Trigger Numba JIT compilation for N-gram proposer. # This usually takes less than 1 second. self.propose( - [np.array([])] * 1024, + [[]] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32), np.zeros((1024, self.max_model_len), dtype=np.int32), @@ -131,7 +131,7 @@ def batch_propose( def propose( self, - sampled_token_ids: list[np.ndarray], + sampled_token_ids: list[list[int]], req_ids: list[str], num_tokens_no_spec: np.ndarray, token_ids_cpu: np.ndarray, @@ -140,7 +140,7 @@ def propose( # find which requests need ngram proposals valid_ngram_requests = [] for i, sampled_ids in enumerate(sampled_token_ids): - num_sampled_ids = sampled_ids.shape[0] + num_sampled_ids = len(sampled_ids) if not num_sampled_ids: # Skip speculative decoding. continue diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py index d76e0ffe778d..049e335db325 100644 --- a/vllm/v1/spec_decode/suffix_decoding.py +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import numpy as np - from vllm.config import VllmConfig from vllm.v1.worker.gpu_input_batch import InputBatch @@ -34,16 +32,16 @@ def __init__(self, vllm_config: VllmConfig): def propose( self, input_batch: InputBatch, - sampled_token_ids: list[np.ndarray], + sampled_token_ids: list[list[int]], ) -> list[list[int]]: """ Propose speculative tokens for each request in the input batch. Suffix Decoding will speculate a dynamic number of tokens for each request every decoding step, so each entry in the returned list may have different lengths. 
""" - draft_token_ids: list[np.ndarray] = [] + draft_token_ids: list[list[int]] = [] for i, sampled_ids in enumerate(sampled_token_ids): - if sampled_ids.shape[0] == 0: + if not sampled_ids: # Skip speculative decoding for partial prefills. draft_token_ids.append([]) continue @@ -72,7 +70,7 @@ def propose( self.suffix_cache.start_request(req_id, prompt_token_ids) # Append the newly sampled ids to the suffix cache for this request. - self.suffix_cache.add_active_response(req_id, sampled_ids.tolist()) + self.suffix_cache.add_active_response(req_id, sampled_ids) # Suffix decoding only uses the most recent tokens up to max_tree_depth, so # we extract the pattern from the end of the input. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0d6164180e6..9b3e5b668aab 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -216,11 +216,9 @@ def get_output(self) -> ModelRunnerOutput: del self._logprobs_tensors del self._sampled_token_ids - valid_sampled_token_ids: list[np.ndarray] = [ - row for row in self.sampled_token_ids_cpu.numpy() - ] + valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() for i in self._invalid_req_indices: - valid_sampled_token_ids[i] = np.array([]) + valid_sampled_token_ids[i].clear() output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids @@ -2341,7 +2339,7 @@ def _bookkeeping_sync( ) -> tuple[ dict[str, int], LogprobsLists | None, - list[np.ndarray], + list[list[int]], dict[str, LogprobsTensors | None], list[str], dict[str, int], @@ -2367,7 +2365,6 @@ def _bookkeeping_sync( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids invalid_req_indices = [] - valid_sampled_token_ids: list[np.ndarray] if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2382,7 +2379,7 @@ def _bookkeeping_sync( ) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)] = np.array([]) + valid_sampled_token_ids[int(i)].clear() else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2410,24 +2407,19 @@ def _bookkeeping_sync( [0] if spec_decode_metadata and logprobs_tensors else None ) for req_idx in range(num_sampled_tokens): - sampled_ids: np.ndarray | None if self.use_async_scheduling: - sampled_ids = ( - np.array([-1]) if req_idx not in invalid_req_indices_set else None - ) + sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None else: sampled_ids = valid_sampled_token_ids[req_idx] - num_sampled_ids: int = ( - sampled_ids.shape[0] if sampled_ids is not None else 0 - ) + num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 if cu_num_accepted_tokens is not None: cu_num_accepted_tokens.append( cu_num_accepted_tokens[-1] + num_sampled_ids ) - if sampled_ids is None or num_sampled_ids == 0: + if not sampled_ids: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] @@ -2769,9 +2761,7 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) - def propose_draft_token_ids( - sampled_token_ids: torch.Tensor | list[np.ndarray], - ) -> None: + def propose_draft_token_ids(sampled_token_ids): assert spec_decode_common_attn_metadata is not None with record_function_or_nullcontext("gpu_model_runner: draft"): self._draft_token_ids = self.propose_draft_token_ids( @@ -2893,14 +2883,14 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: torch.Tensor | list[np.ndarray], + sampled_token_ids: torch.Tensor | list[list[int]], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, aux_hidden_states: list[torch.Tensor] | None, spec_decode_metadata: SpecDecodeMetadata | None, common_attn_metadata: CommonAttentionMetadata, - ) -> torch.Tensor | list[list[int]]: + ) -> list[list[int]] | torch.Tensor: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": assert isinstance(sampled_token_ids, list) @@ -2932,7 +2922,7 @@ def propose_draft_token_ids( for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids ): - indices.append(offset + tokens.shape[0] - 1) + indices.append(offset + len(tokens) - 1) offset += num_draft + 1 indices = torch.tensor(indices, device=self.device) hidden_states = sample_hidden_states[indices] @@ -4872,7 +4862,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: # This is a short term mitigation for issue mentioned in # https://github.com/vllm-project/vllm/issues/22754. 
# `tolist` would trigger a cuda wise stream sync, which @@ -4885,4 +4875,4 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: pinned.copy_(sampled_token_ids, non_blocking=True) self.transfer_event.record() self.transfer_event.synchronize() - return [row for row in pinned.numpy()] + return pinned.tolist() From 363aaeef0ff8511fd1466d41a2e027b22b28f39b Mon Sep 17 00:00:00 2001 From: Mohammad Othman <48595863+OthmanMohammad@users.noreply.github.com> Date: Sat, 15 Nov 2025 06:31:36 +0200 Subject: [PATCH 089/578] Fix IntermediateTensors initialization and add type hints (#28743) Signed-off-by: Mohammad Othman Co-authored-by: Mohammad Othman --- vllm/sequence.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 6bcc94ad5c62..6d20ca9aac22 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -60,12 +60,17 @@ class IntermediateTensors: tensors: dict[str, torch.Tensor] kv_connector_output: KVConnectorOutput | None - def __init__(self, tensors): + def __init__( + self, + tensors: dict[str, torch.Tensor], + kv_connector_output: KVConnectorOutput | None = None, + ) -> None: # manually define this function, so that # Dynamo knows `IntermediateTensors()` comes from this file. # Otherwise, dataclass will generate this function by evaluating # a string, and we will lose the information about the source file. self.tensors = tensors + self.kv_connector_output = kv_connector_output def __getitem__(self, key: str | slice): if isinstance(key, str): From c9e665852abbd42d7404a4f6dad7d47478ca95f8 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 14 Nov 2025 23:51:32 -0600 Subject: [PATCH 090/578] [NIXL] heterogeneous block_size support (#26759) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chendi Xue Signed-off-by: Chendi.Xue Co-authored-by: Nicolò Lucchesi --- .../nixl_integration/run_accuracy_test.sh | 4 + .../kv_connector/unit/test_nixl_connector.py | 3 + .../kv_connector/v1/nixl_connector.py | 309 ++++++++++++++---- 3 files changed, 257 insertions(+), 59 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index a9817313cf02..ebc8575e5b39 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} +PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16} +DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) @@ -136,6 +138,7 @@ run_tests_for_model() { vllm serve $model_name \ --port $PORT \ --enforce-eager \ + --block-size ${PREFILL_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '$KV_CONFIG'" @@ -177,6 +180,7 @@ run_tests_for_model() { vllm serve $model_name \ --port $PORT \ --enforce-eager \ + --block-size ${DECODE_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --kv-transfer-config '$KV_CONFIG'" diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 8e421717fea3..b7d7a10057b8 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ 
b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -407,6 +407,7 @@ def _nixl_handshake( # `self.kv_cache_layout` is only forced to HND when vllm engine # is started. We mock HND here. kv_cache_layout="HND", + block_size=self.block_size, ), remote_tp_size=remote_tp_size, ) @@ -652,6 +653,7 @@ def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init): block_lens=worker.block_len_per_layer, attn_backend_name=worker.backend_name, kv_cache_layout=mismatched_layout, + block_size=worker.block_size, ) with pytest.raises(RuntimeError): @@ -706,6 +708,7 @@ def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental( block_lens=[i * 2 for i in worker.block_len_per_layer], attn_backend_name=worker.backend_name, kv_cache_layout="HND", + block_size=worker.block_size, ) # We don't check layout for homogeneous TP and MLA for now, as the diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 3d4547c51453..a70c98b63713 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -108,6 +108,7 @@ class NixlAgentMetadata(KVConnectorHandshakeMetadata): block_lens: list[int] attn_backend_name: str kv_cache_layout: str + block_size: int @dataclass @@ -709,6 +710,9 @@ def split_k_and_v(self) -> bool: self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first ) + block_size: int + remote_block_size: dict[EngineId, int] + def tp_ratio( self, remote_tp_size: int, @@ -725,6 +729,19 @@ def tp_ratio( ) return self.tp_size // remote_tp_size + def block_size_ratio( + self, + remote_block_size: int, + ) -> float: + """ + Calculate the block size ratio between local and remote TP. + """ + assert self.block_size % remote_block_size == 0, ( + f"Local block size {self.block_size} is not divisible " + f"by remote block size {remote_block_size} or vice versa." + ) + return self.block_size // remote_block_size + def tp_ratio_from_engine_id( self, remote_engine_id: EngineId, @@ -732,6 +749,13 @@ def tp_ratio_from_engine_id( remote_tp_size = self.remote_tp_size[remote_engine_id] return self.tp_ratio(remote_tp_size) + def block_size_ratio_from_engine_id( + self, + remote_engine_id: EngineId, + ) -> float: + remote_block_size = self.remote_block_size[remote_engine_id] + return self.block_size_ratio(remote_block_size) + def is_kv_replicated(self, engine_id: EngineId) -> bool: """ Whether the KV cache is replicated across TP workers due to the @@ -866,6 +890,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): # nixl_prepped_dlist_handle. self.src_xfer_side_handle: int = 0 + self.src_xfer_side_handles: dict[int, int] = {} # Map of engine_id -> nixl_prepped_dlist_handle (int)]. self.dst_xfer_side_handles: dict[EngineId, int] = {} @@ -925,6 +950,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): logger.debug("Detected kv cache layout %s", self.kv_cache_layout) self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} + self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size} # With heterogeneous TP, P must wait for all assigned D TP workers to # finish reading before safely freeing the blocks. 
self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) @@ -936,6 +962,8 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): remote_tp_size=self._tp_size, # shared state is_mla=self.use_mla, total_num_kv_heads=self.model_config.get_total_num_kv_heads(), + block_size=self.block_size, + remote_block_size=self._block_size, attn_backend=backend, ) self._use_pallas = self.kv_topo._use_pallas @@ -987,9 +1015,13 @@ def _nixl_handshake( ) # Register Remote agent. + assert metadata.block_size <= self.block_size, ( + "nP > nD is not supported yet." + ) remote_agent_name = self.add_remote_agent( metadata, p_remote_rank, remote_tp_size ) + setup_agent_time = time.perf_counter() logger.debug( "NIXL handshake: add agent took: %s", @@ -1217,43 +1249,10 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.num_regions *= 2 # Register local/src descr for NIXL xfer. - blocks_data = [] - for i, base_addr in enumerate(seen_base_addresses): - kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) - # NOTE With heter-TP, more blocks are prepared than what are - # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We - # could create fewer, but then _get_block_descs_ids needs to - # select agent_meta.num_blocks instead of self.num_blocks for - # local descr, and that makes handling regular flow less clean. - for block_id in range(self.num_blocks): - block_offset = block_id * self.block_len_per_layer[i] - addr = base_addr + block_offset - # (addr, len, device id) - blocks_data.append((addr, kv_block_len, self.device_id)) - - if self.kv_topo.is_kv_layout_blocks_first: - # Separate and interleave K/V regions to maintain the same - # descs ordering. This is needed for selecting contiguous heads - # when split across TP ranks. - for block_id in range(self.num_blocks): - block_offset = block_id * self.block_len_per_layer[i] - addr = base_addr + block_offset - # Register addresses for V cache (K registered first). - v_addr = addr + kv_block_len - blocks_data.append((v_addr, kv_block_len, self.device_id)) - logger.debug( - "Created %s blocks for src engine %s and rank %s on device id %s", - len(blocks_data), - self.engine_id, - self.tp_rank, - self.device_id, - ) + self.seen_base_addresses = seen_base_addresses + self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size) - descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) - # NIXL_INIT_AGENT to be used for preparations of local descs. - self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist( - "NIXL_INIT_AGENT", descs - ) + self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle # TODO(mgoin): Hybrid memory allocator is currently disabled for # models with local attention (Llama 4). Can remove this once enabled. @@ -1289,8 +1288,62 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): kv_cache_layout=self.kv_cache_layout if not self.use_host_buffer else self.host_buffer_kv_cache_layout, + block_size=self.block_size, + ) + + def register_local_xfer_handler( + self, + block_size: int, + ) -> int: + """ + Function used for register local xfer handler with local block_size or + Remote block_size. + + When local block_size is same as remote block_size, we use local block_size + to register local_xfer_handler during init. + + When remote block size is less than local block size, we need to use + register another local_xfer_handler using remote block len to ensure + data copy correctness. 
+ """ + block_size_ratio = self.block_size // block_size + blocks_data = [] + for i, base_addr in enumerate(self.seen_base_addresses): + # The new block_len is using prefill block_len; + # and num_blocks is multiple with N + kv_block_len = ( + self.get_backend_aware_kv_block_len(layer_idx=i) // block_size_ratio + ) + block_len_per_layer = self.block_len_per_layer[i] // block_size_ratio + num_blocks = self.num_blocks * block_size_ratio + for block_id in range(num_blocks): + block_offset = block_id * block_len_per_layer + addr = base_addr + block_offset + # (addr, len, device id) + blocks_data.append((addr, kv_block_len, self.device_id)) + + if self.kv_topo.is_kv_layout_blocks_first: + # Separate and interleave K/V regions to maintain the same + # descs ordering. This is needed for selecting contiguous heads + # when split across TP ranks. + for block_id in range(num_blocks): + block_offset = block_id * block_len_per_layer + addr = base_addr + block_offset + # Register addresses for V cache (K registered first). + v_addr = addr + kv_block_len + blocks_data.append((v_addr, kv_block_len, self.device_id)) + logger.debug( + "Created %s blocks for src engine %s and rank %s on device id %s", + len(blocks_data), + self.engine_id, + self.tp_rank, + self.device_id, ) + descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) + # NIXL_INIT_AGENT to be used for preparations of local descs. + return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs) + def add_remote_agent( self, nixl_agent_meta: NixlAgentMetadata, @@ -1349,6 +1402,8 @@ def add_remote_agent( ### Register remote agent metadata if engine_id not in self._tp_size: self._tp_size[engine_id] = remote_tp_size + if engine_id not in self._block_size: + self._block_size[engine_id] = nixl_agent_meta.block_size remote_agent_name = self.nixl_wrapper.add_remote_agent( nixl_agent_meta.agent_metadata @@ -1359,6 +1414,13 @@ def add_remote_agent( # Create dst descs and xfer side handles. TP workers have same #blocks # so we only register once per engine_id. + # Example: + # block_size_ratio > 1: + # remote: | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12| + # local origin:| 0| 1| 8| 12| + # local mapped:| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15| + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(engine_id) + if engine_id not in self.dst_num_blocks: self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks @@ -1381,8 +1443,14 @@ def add_remote_agent( # Register all remote blocks, but only the corresponding kv heads. 
for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr): kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) + remote_kv_block_len = kv_block_len // block_size_ratio + if block_size_ratio > 1: + # using remote kv_block_len as transfer unit + kv_block_len = remote_kv_block_len rank_offset = ( - self.tp_rank % tp_ratio * kv_block_len if not replicates_kv_cache else 0 + self.tp_rank % tp_ratio * remote_kv_block_len + if not replicates_kv_cache + else 0 ) for block_id in range(nixl_agent_meta.num_blocks): block_offset = block_id * nixl_agent_meta.block_lens[i] @@ -1417,6 +1485,13 @@ def add_remote_agent( remote_agent_name, descs ) + if block_size_ratio > 1: + # when prefill with smaller block_size, we need to init a + # new handler with same block_len to match + self.src_xfer_side_handles[nixl_agent_meta.block_size] = ( + self.register_local_xfer_handler(nixl_agent_meta.block_size) + ) + return remote_agent_name def _validate_remote_agent_handshake( @@ -1433,6 +1508,9 @@ def _validate_remote_agent_handshake( assert nixl_agent_meta.attn_backend_name == self.backend_name tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id) + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( + remote_engine_id + ) assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP" assert not self._use_pallas or tp_ratio == 1, ( "TPU (pallas_v1) DOES NOT support heterogeneous TP yet." @@ -1463,33 +1541,26 @@ def _validate_remote_agent_handshake( remote_block_len = nixl_agent_meta.block_lens[0] if self.use_mla or self.kv_topo.is_kv_replicated(remote_engine_id): # With replicated KV cache, only the number of blocks can differ. - assert self.block_len_per_layer == nixl_agent_meta.block_lens, ( - "KV cache sizes must match between P and D when replicated" - ) - remote_block_size = remote_block_len // (self.slot_size_per_layer[0]) + for i in range(len(self.block_len_per_layer)): + assert ( + self.block_len_per_layer[i] // block_size_ratio + == nixl_agent_meta.block_lens[i] + ), "KV cache sizes must match between P and D when replicated" else: # When MLA is not used, this is a list of the same block length for block_len in nixl_agent_meta.block_lens: assert block_len == remote_block_len, ( "All remote layers must have the same block size" ) - remote_block_size = remote_block_len // ( - self.slot_size_per_layer[0] * tp_ratio - ) - if self.kv_topo.is_kv_layout_blocks_first: - # With flashinfer, KV are sent in the same message. - remote_block_size //= 2 - assert remote_block_len == self.block_len_per_layer[0] * tp_ratio, ( + assert ( + remote_block_len + == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio + ), ( "Remote P worker KV layer cache must be of shape [2, N, " "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype." ) - assert self.block_size == remote_block_size, ( - "Remote P worker with different page/block size is not supported " - f"{self.block_size=}, {remote_block_size=}" - ) - # TP workers have same #blocks. 
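# A worked example of the remote descriptor addressing above, using assumed
# numbers (not taken from this change): base_addr=0x1000, remote
# block_lens[i]=4096 bytes, local kv_block_len=2048, block_size_ratio=2,
# tp_ratio=2, tp_rank=1, and no KV replication.
base_addr = 0x1000
remote_block_len = 4096
kv_block_len = 2048
block_size_ratio = 2
tp_ratio = 2
tp_rank = 1

remote_kv_block_len = kv_block_len // block_size_ratio  # transfer unit: 1024 bytes
rank_offset = tp_rank % tp_ratio * remote_kv_block_len  # this rank's head slice
addr_block_3 = base_addr + 3 * remote_block_len + rank_offset
assert remote_kv_block_len == 1024 and rank_offset == 1024
assert addr_block_3 == 17408  # 0x1000 + 3 * 4096 + 1024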
assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks @@ -1576,6 +1647,56 @@ def permute_device_kv(self, block_ids: list[int]): ) cache.index_copy_(0, indices, permuted_blocks) + def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]): + def _process_local_gt_remote(blocks_to_update, block_size_ratio): + n_kv_heads, block_size, head_size = blocks_to_update.shape[1:] + remote_block_size = block_size // block_size_ratio + n_blocks = block_size_ratio + # actual permute is to convert + # for local blocksize > remote blocksize + # ex: local blocksize = 16 tokens, remote blocksize = 4 tokens + # local block[0] = remote block[0, 1, 2, 3] + # remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|... + # local is |h0-b0..................|h1-b0..................|... + # permute is to: + # 1. view => view remote as n_blocks * remote_shape(H,remoteN,D) + # 2. permute => (H, nblocks, remoteN, D) + # 3. flatten => (H, localN, D) + permuted_blocks = ( + blocks_to_update.reshape( + -1, n_blocks, n_kv_heads, remote_block_size, head_size + ) + .permute(0, 2, 1, 3, 4) + .flatten(2, 3) + ) + return permuted_blocks + + if len(self.device_kv_caches) == 0: + return + split_k_and_v = not ( + self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first + ) + sample_cache = list(self.device_kv_caches.values())[0][0] + for block_size_ratio, block_ids_list in block_ids_per_ratio.items(): + assert block_size_ratio > 1, "Only nP < nD supported currently." + block_ids_list = [[item for sublist in block_ids_list for item in sublist]] + + for block_ids in block_ids_list: + indices = torch.tensor(block_ids, device=sample_cache.device) + + for _, cache_or_caches in self.device_kv_caches.items(): + cache_list = cache_or_caches if split_k_and_v else [cache_or_caches] + for cache in cache_list: + blocks_to_update = cache.index_select(0, indices) + # because kv_cache is always using original layout NHD as + # virtual shape while stride can be either HND / NHD at + # initialization. + # we need to firstly get physical view of the tensor + permuted_blocks = _process_local_gt_remote( + blocks_to_update.permute(0, 2, 1, 3), block_size_ratio + ).permute(0, 2, 1, 3) + cache.index_copy_(0, indices, permuted_blocks) + def get_finished(self) -> tuple[set[str], set[str]]: """ Get requests that are done sending or recving on this specific worker. 
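# A tiny torch sketch of the view/permute/flatten fix documented in
# _process_local_gt_remote above, using assumed sizes: 2 kv heads, local
# block_size=4, remote block_size=2 (block_size_ratio=2), head_size=1.
import torch

ratio, n_kv_heads, remote_block_size, head_size = 2, 2, 2, 1
# Physical contents of one local block as written by the remote: two
# remote-sized blocks back to back, each laid out in (head, token) order.
# Head 0 holds token values 0..3, head 1 holds token values 10..13.
physical = torch.tensor([0, 1, 10, 11, 2, 3, 12, 13]).reshape(1, n_kv_heads, 4, head_size)
fixed = (
    physical.reshape(-1, ratio, n_kv_heads, remote_block_size, head_size)
    .permute(0, 2, 1, 3, 4)
    .flatten(2, 3)
)
# After the permute, the local HND layout holds all of h0's tokens, then h1's.
assert fixed.flatten().tolist() == [0, 1, 2, 3, 10, 11, 12, 13]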
@@ -1599,6 +1720,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: ) block_ids_to_permute = [] + block_ids_for_blocksize_post_process = defaultdict(list) for req_id in done_recving: # clean up metadata for completed requests meta = self._recving_metadata.pop(req_id, None) @@ -1607,6 +1729,20 @@ def get_finished(self) -> tuple[set[str], set[str]]: self.sync_recved_kv_to_device(req_id, meta) if self.enable_permute_local_kv: block_ids_to_permute += meta.local_physical_block_ids + + # post processing for heteroblocksize + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( + meta.remote_engine_id + ) + if ( + not self.use_mla + and block_size_ratio > 1 + and self.kv_cache_layout == "HND" + ): + block_ids_for_blocksize_post_process[block_size_ratio].append( + meta.local_block_ids + ) + self.blocksize_post_process(block_ids_for_blocksize_post_process) if len(block_ids_to_permute) > 0: self.permute_device_kv(block_ids_to_permute) @@ -1781,6 +1917,24 @@ def _read_blocks( dst_engine_id: str, request_id: str, ): + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) + if block_size_ratio > 1: + local_block_ids = self.get_mapped_blocks( + np.asarray(local_block_ids), block_size_ratio + ) + if len(local_block_ids) > len(remote_block_ids): + # NOTE: + # get_mapped_blocks will always expand block_ids for n times. + # ex: + # prefill block_ids with block_size as 4: + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + # Local decode block_ids with block_size as 16: [1, 2, 3] + # expland ecode block_ids with get_mapped_blocks from [1, 2, 3] to + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + # Then we clip local to align with prefill + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + local_block_ids = local_block_ids[: len(remote_block_ids)] # NOTE(rob): having the staging blocks be on the READER side is # not going to work well (since we will have to call rearrange tensors). # after we detect the txn is complete (which means we cannot make the @@ -1823,7 +1977,10 @@ def _read_blocks( remote_block_ids = remote_block_ids[-num_local_blocks:] # Get side handles. - local_xfer_side_handle = self.src_xfer_side_handle + remote_block_size = self.kv_topo.remote_block_size[dst_engine_id] + local_xfer_side_handle = self.src_xfer_side_handles.get( + remote_block_size, self.src_xfer_side_handle + ) remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from @@ -1833,13 +1990,17 @@ def _read_blocks( # Get descs ids. local_block_descs_ids: np.ndarray remote_block_descs_ids: np.ndarray + if not self.block_window_per_layer: # Default case: assume global attention remote_block_descs_ids = self._get_block_descs_ids( - dst_engine_id, remote_block_ids + dst_engine_id, + remote_block_ids, ) local_block_descs_ids = self._get_block_descs_ids( - self.engine_id, local_block_ids + self.engine_id, + local_block_ids, + block_size_ratio=block_size_ratio, ) else: # TODO(mgoin): remove this once we have hybrid memory allocator @@ -1860,10 +2021,15 @@ def _read_blocks( # Get descs ids for the layer. 
layer_local_desc_ids = self._get_block_descs_ids( - self.engine_id, layer_local_block_ids, layer_idx + dst_engine_id, + layer_local_block_ids, + layer_idx, ) layer_remote_desc_ids = self._get_block_descs_ids( - dst_engine_id, layer_remote_block_ids, layer_idx + self.engine_id, + layer_remote_block_ids, + layer_idx, + block_size_ratio=block_size_ratio, ) local_descs_list.append(layer_local_desc_ids) @@ -1905,8 +2071,31 @@ def _read_blocks( self.nixl_wrapper.release_xfer_handle(handle) self._failed_recv_reqs.add(request_id) + def get_mapped_blocks(self, block_ids, block_size_ratio): + """ + Calculates the new set of block IDs by mapping every element + in the (potentially sparse) input array. + Example: block_ids=[0, 2], block_size_ratio=2 + get_mapped_blocks 0 1 [2 3] 4 5 + # remote is |h0-b0|h1-b0||h0-b1|h1-b1||h0-b1|h1-b1|| + # local is |h0-b0......||h1-b0......||h2-b0........ + local_block_ids 0 [1] 2 + """ + if block_ids.size == 0: + return np.array([], dtype=np.int64) + + start_ids = block_ids * block_size_ratio + offsets = np.arange(block_size_ratio) + mapped_2d = start_ids[:, None] + offsets[None, :] + + return mapped_2d.flatten().astype(np.int64) + def _get_block_descs_ids( - self, engine_id: str, block_ids: list[int], layer_idx: int | None = None + self, + engine_id: str, + block_ids: list[int], + layer_idx: int | None = None, + block_size_ratio: float | None = None, ) -> np.ndarray: """ Get the descs ids for a set of block ids. @@ -1929,6 +2118,8 @@ def _get_block_descs_ids( region_ids = np.arange(layer_idx, layer_idx + 1) num_blocks = self.dst_num_blocks[engine_id] + if block_size_ratio is not None: + num_blocks = int(num_blocks * block_size_ratio) # Compute the desc ids for each block. region_ids = region_ids[:, None] From 6965ef436fb398bfbbdce5b6f88dd842c5944771 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 15 Nov 2025 00:52:14 -0500 Subject: [PATCH 091/578] [Performance][DeepGEMM] Estimate expected_m (#28694) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/kernels/moe/test_deepep_deepgemm_moe.py | 46 ++++++++++++++----- vllm/forward_context.py | 4 ++ .../layers/fused_moe/batched_deep_gemm_moe.py | 40 ++++++++++++++-- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 0faf8bc95d2e..455ecacef5ec 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -7,6 +7,7 @@ """ import dataclasses +from contextlib import contextmanager import pytest import torch.distributed @@ -14,6 +15,7 @@ from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config +from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, @@ -61,6 +63,23 @@ P = ParamSpec("P") +@contextmanager +def with_dp_metadata(M: int, world_size: int): + num_tokens_across_dp = torch.tensor([M] * world_size, device="cpu", dtype=torch.int) + + vllm_config = VllmConfig() + vllm_config.parallel_config.data_parallel_size = world_size + vllm_config.parallel_config.enable_expert_parallel = True + + with set_forward_context( + None, + vllm_config, + num_tokens=M, + num_tokens_across_dp=num_tokens_across_dp, + ): + yield + + def next_power_of_2(x): import math @@ -285,18 +304,21 @@ def build_expert_map(): quant_config=quant_config, ) - out = mk.forward( - 
hidden_states=test_tensors.rank_tokens, - w1=w1, - w2=w2, - topk_weights=test_tensors.topk_weights, - topk_ids=test_tensors.topk, - inplace=False, - activation="silu", - global_num_experts=num_experts, - expert_map=build_expert_map(), - apply_router_weight_on_input=False, - ) + with with_dp_metadata( + M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size + ): + out = mk.forward( + hidden_states=test_tensors.rank_tokens, + w1=w1, + w2=w2, + topk_weights=test_tensors.topk_weights, + topk_ids=test_tensors.topk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + apply_router_weight_on_input=False, + ) return out diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 44bc2a4cda31..25fb7181a8f2 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -221,6 +221,10 @@ def get_forward_context() -> ForwardContext: return _forward_context +def is_forward_context_available() -> bool: + return _forward_context is not None + + def create_forward_context( attn_metadata: Any, vllm_config: VllmConfig, diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 79c92eb48612..53362277dae8 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( @@ -19,7 +20,7 @@ get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, ) -from vllm.utils.math_utils import cdiv +from vllm.utils.math_utils import cdiv, round_up logger = init_logger(__name__) @@ -313,6 +314,33 @@ def workspace_shapes( output = (num_experts, max_num_tokens * num_dispatchers, K) return (workspace13, workspace2, output) + def estimate_expected_m( + self, global_num_experts: int, max_tokens_per_expert: int, topk: int + ) -> int: + dp_meta = ( + get_forward_context().dp_metadata + if is_forward_context_available() + else None + ) + if dp_meta is None: + logger.warning_once( + "DPMetadata unavailable. Defaulting expected_m to " + f"{max_tokens_per_expert}.", + scope="local", + ) + return max_tokens_per_expert + + total_num_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item() + total_num_tokens_replicated = total_num_tokens * topk + + # Assume even load balancing + assert global_num_experts != 0 + estimate = round_up(int(total_num_tokens_replicated // global_num_experts), 16) + # clamp estimate + estimate = max(estimate, 16) + estimate = min(max_tokens_per_expert, estimate) + return estimate + def apply( self, output: torch.Tensor, @@ -348,10 +376,12 @@ def apply( workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N)) - # (from deepgemm docs) : A value hint (which is a value on CPU) - # for the M expectation of each batch, correctly setting this value - # may lead to better performance. 
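# A worked example of the estimate_expected_m() heuristic above, with assumed
# numbers: 4 DP ranks of 128 tokens each, topk=8, 256 global experts, and
# max_tokens_per_expert=512.
def _round_up(x: int, multiple: int) -> int:
    return ((x + multiple - 1) // multiple) * multiple

total_num_tokens = 4 * 128                   # summed across DP ranks
replicated = total_num_tokens * 8            # each token is routed to topk experts
estimate = _round_up(replicated // 256, 16)  # assume an even expert load
estimate = min(max(estimate, 16), 512)       # clamp to [16, max_tokens_per_expert]
assert estimate == 16                        # 512 * 8 / 256 = 16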
- expected_m = max_num_tokens + expected_m = self.estimate_expected_m( + global_num_experts=global_num_experts, + max_tokens_per_expert=max_num_tokens, + topk=topk_ids.size(-1), + ) + fp8_m_grouped_gemm_nt_masked( (a1q, a1q_scale), (w1, self.w1_scale), From 98b4d389ed27f09fd185ade889a02f640a3ff0b4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 14:47:41 +0800 Subject: [PATCH 092/578] [Redo] #26368 (#28771) Signed-off-by: Jialin Ouyang Signed-off-by: DarkLight1337 Co-authored-by: Jialin Ouyang --- tests/v1/core/test_async_scheduler.py | 3 +- .../v1/core/test_priority_scheduler_random.py | 6 +- tests/v1/core/test_scheduler.py | 88 +++++++++++-------- .../kv_connector/unit/test_nixl_connector.py | 7 +- tests/v1/kv_connector/unit/utils.py | 3 +- tests/v1/spec_decode/test_eagle.py | 5 +- tests/v1/spec_decode/test_ngram.py | 18 ++-- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/outputs.py | 4 +- vllm/v1/sample/rejection_sampler.py | 8 +- vllm/v1/spec_decode/eagle.py | 7 +- vllm/v1/spec_decode/ngram_proposer.py | 6 +- vllm/v1/spec_decode/suffix_decoding.py | 10 ++- vllm/v1/worker/gpu_model_runner.py | 36 +++++--- vllm/v1/worker/tpu_model_runner.py | 8 +- 15 files changed, 122 insertions(+), 91 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index e0645ed43015..1d80ee987591 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque +import numpy as np import pytest from vllm.v1.core.sched.output import SchedulerOutput @@ -21,7 +22,7 @@ def _make_model_runner_output( return ModelRunnerOutput( req_ids=req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)}, - sampled_token_ids=[[i] for i in range(len(req_ids))], + sampled_token_ids=[np.array([i]) for i in range(len(req_ids))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py index b4805be80272..ba0b703302e3 100644 --- a/tests/v1/core/test_priority_scheduler_random.py +++ b/tests/v1/core/test_priority_scheduler_random.py @@ -3,6 +3,7 @@ import random import uuid +import numpy as np import pytest from vllm.config import VllmConfig @@ -99,8 +100,7 @@ def _mock_execute_model( random.randint(*num_output_tokens_range) for _ in range(len(request_ids)) ] sampled_token_ids = [ - [random.randint(0, 100) for _ in range(num_tokens)] - for num_tokens in num_output_tokens + np.random.randint(0, 100, size=num_tokens) for num_tokens in num_output_tokens ] return ModelRunnerOutput( @@ -196,6 +196,8 @@ def test_priority_scheduling_blast( num_blocks: int, ): random.seed(42) + np.random.seed(42) + seen_request_prompt_length = dict[str, int]() seen_request_ids = set[str]() seen_mm_hashes = set[str]() diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 04e738293cd7..0570c0854c67 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3,6 +3,7 @@ import dataclasses from unittest.mock import Mock +import numpy as np import pytest import torch @@ -169,7 +170,7 @@ def test_schedule_partial_requests(): req_id_to_index=req_to_index, # Only the first request has a sampled token id because # the rest requests are still being prefilled. 
- sampled_token_ids=[[0], [], []], + sampled_token_ids=[np.array([0]), np.array([]), np.array([])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -216,7 +217,7 @@ def test_no_mm_input_chunking(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -276,7 +277,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -300,7 +301,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + sampled_token_ids=[np.array([0]), np.array([0])] + + [np.array([]) for _ in range(len(requests) - 2)], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -347,8 +349,8 @@ def test_stop_via_update_from_output(): req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, sampled_token_ids=[ - [EOS_TOKEN_ID], - [10, 11], + np.array([EOS_TOKEN_ID]), + np.array([10, 11]), ], # First request hits EOS, second continues logprobs=None, prompt_logprobs_dict={}, @@ -392,7 +394,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token + sampled_token_ids=[ + np.array([10, 42, 12]), + np.array([13, 14]), + ], # First request hits stop token logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -436,7 +441,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens + sampled_token_ids=[ + np.array([10, 11, 12]), + np.array([13]), + ], # First request exceeds max_tokens logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -475,7 +483,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -616,7 +624,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -633,7 +641,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, 
prompt_logprobs_dict={}, pooler_output=[], @@ -670,7 +678,7 @@ def test_preempt_during_execution(): model_runner_output0 = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -687,7 +695,7 @@ def test_preempt_during_execution(): model_runner_output1 = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[42]], + sampled_token_ids=[np.array([42])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -704,14 +712,18 @@ def test_preempt_during_execution(): @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match - ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch - ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (2, 3, 3, [2, 1])), # multiple sequences - ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence - ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])), # early mismatch + ( + [[1, 2], [3]], + [np.array([1, 2, 5]), np.array([3, 4])], + (2, 3, 3, [2, 1]), + ), # multiple sequences + ([[1]], [np.array([1, 2])], (1, 1, 1, [1])), # single token sequence + ([[]], [np.array([5])], (0, 0, 0, [0])), # empty sequence ( [[1, 2, 3], [4, 5, 6]], - [[1, 2, 7], [4, 8]], + [np.array([1, 2, 7]), np.array([4, 8])], (2, 6, 3, [2, 1, 0]), ), # multiple mismatches ], @@ -745,7 +757,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], + sampled_token_ids=[np.array([0]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -972,7 +984,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1025,7 +1037,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1088,7 +1100,7 @@ def test_external_prefix_cache_metrics(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[r.request_id for r in requests], req_id_to_index={r.request_id: i for i, r in enumerate(requests)}, - sampled_token_ids=[[1000]] * NUM_REQUESTS, + sampled_token_ids=[np.array([1000])] * NUM_REQUESTS, logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1154,7 +1166,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1239,7 +1251,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * 
len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1332,7 +1344,7 @@ def make_output(scheduler: Scheduler): return ModelRunnerOutput( req_ids=[req.request_id for req in scheduler.running], req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)}, - sampled_token_ids=[[1000]] * len(scheduler.running), + sampled_token_ids=[np.array([1000])] * len(scheduler.running), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1749,7 +1761,7 @@ def test_priority_scheduling_preemption(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1818,7 +1830,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2064,7 +2076,7 @@ def test_priority_scheduling_heap_property(): model_output = ModelRunnerOutput( req_ids=[req.req_id], req_id_to_index={req.req_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2150,7 +2162,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2181,7 +2193,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100] for _ in requests], + sampled_token_ids=[np.array([100]) for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2207,7 +2219,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[], [100]], + sampled_token_ids=[np.array([]), np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2624,7 +2636,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector): model_output = ModelRunnerOutput( req_ids=[request1.request_id], req_id_to_index={request1.request_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2830,7 +2842,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2943,7 +2955,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[[100]], + 
sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2994,7 +3006,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100] for _ in requests], + sampled_token_ids=[np.array([100]) for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -3029,7 +3041,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100], [100, 200]], + sampled_token_ids=[np.array([100]), np.array([100, 200])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -3215,7 +3227,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto model_output = ModelRunnerOutput( req_ids=[request1.request_id, request2.request_id], req_id_to_index={request1.request_id: 0, request2.request_id: 1}, - sampled_token_ids=[[100], [121]], + sampled_token_ids=[np.array([100]), np.array([121])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index b7d7a10057b8..b264e5108c16 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -11,6 +11,7 @@ from collections import defaultdict from unittest.mock import patch +import numpy as np import pytest import ray import torch @@ -826,7 +827,7 @@ def test_kv_connector_stats_aggregation(): output = ModelRunnerOutput( req_ids=[f"req_{i}"], req_id_to_index={f"req_{i}": 0}, - sampled_token_ids=[[123]], # dummy token + sampled_token_ids=[np.array([123])], # dummy token logprobs=None, prompt_logprobs_dict={}, pooler_output=[None], @@ -907,7 +908,7 @@ def make_multi_stats(nixl_count: int, foo_count: int) -> MultiKVConnectorStats: output = ModelRunnerOutput( req_ids=[f"req_{i}"], req_id_to_index={f"req_{i}": 0}, - sampled_token_ids=[[123]], + sampled_token_ids=[np.array([123])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[None], @@ -965,7 +966,7 @@ def test_scheduler_kv_connector_stats_aggregation(): model_output = ModelRunnerOutput( req_ids=["req_0"], req_id_to_index={"req_0": 0}, - sampled_token_ids=[[123]], + sampled_token_ids=[np.array([123])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[None], diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f35f91bb3adf..c248104d5b5e 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -7,6 +7,7 @@ from itertools import chain, count from typing import Any +import numpy as np import torch from vllm import SamplingParams @@ -228,7 +229,7 @@ def create_model_runner_output( # Make sampled tokens. 
sampled_token = EOS_TOKEN_ID if use_eos else token_id - sampled_token_ids = [[sampled_token] for _ in req_ids] + sampled_token_ids = [np.array([sampled_token]) for _ in req_ids] kv_connector_output = ( None diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 89d0ec769ac0..421da5241555 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -3,6 +3,7 @@ from unittest import mock +import numpy as np import pytest import torch @@ -112,7 +113,9 @@ def test_prepare_next_token_ids(): sampled_token_ids_tensor = torch.tensor( sampled_token_ids, dtype=torch.int32, device=device ) - sampled_token_ids_cpu = [[i for i in seq if i != -1] for seq in sampled_token_ids] + sampled_token_ids_cpu = [ + np.array([i for i in seq if i != -1]) for seq in sampled_token_ids + ] expected_next_token_ids_cpu = [1, 4, 30, 40] expected_next_token_ids_tensor = torch.tensor( diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 692c39282c37..563bc1d957f4 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -77,7 +77,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match. token_ids_cpu = np.array([[1, 2, 3, 4, 5]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -88,7 +88,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -99,7 +99,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram but match for 3-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -111,7 +111,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # In this case, the proposer should return the 4-gram match. token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -122,7 +122,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Match for 2-gram and 3-gram, but not 4-gram. token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -133,7 +133,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Multiple 3-gram matched, but always pick the first one. 
token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -144,7 +144,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # check empty input token_ids_cpu = np.array([[]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -157,7 +157,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # second request has 3 tokens and no match. Padded with -1 for max len 5 token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([5, 3]), token_ids_cpu=token_ids_cpu, @@ -181,7 +181,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: input_2[:3] = [4, 5, 6] token_ids_cpu = np.array([input_1, input_2]) result = ngram_proposer.propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([len(input_1), 3]), token_ids_cpu=token_ids_cpu, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ba7ad0c09173..c640c40a455d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1010,8 +1010,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids = ( - sampled_token_ids[req_index] if sampled_token_ids else [] + generated_token_ids: list[int] = ( + sampled_token_ids[req_index].tolist() if sampled_token_ids else [] ) scheduled_spec_token_ids = ( diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index e32d5bb608b1..c0b2835c3124 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -158,7 +158,7 @@ class ModelRunnerOutput: # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. - sampled_token_ids: list[list[int]] + sampled_token_ids: list[np.ndarray] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] @@ -220,7 +220,7 @@ def make_empty_encoder_model_runner_output( req_id_to_index: dict[str, int] = {rid: idx for idx, rid in enumerate(req_ids)} # No tokens generated yet ⇒ one empty list per request - sampled_token_ids: list[list[int]] = [[0] for _ in req_ids] + sampled_token_ids: list[list[int]] = [np.array([0]) for _ in req_ids] # Pooler outputs are not available yet ⇒ use None placeholders pooler_output: list[torch.Tensor | None] = [None for _ in req_ids] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 926305d25f56..f31a0cddda9a 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -3,6 +3,7 @@ from dataclasses import replace +import numpy as np import torch import torch.nn as nn @@ -204,7 +205,7 @@ def _get_logprobs_tensors( def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[list[int]]: + ) -> list[np.ndarray]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -220,10 +221,7 @@ def parse_output( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) - outputs = [ - row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) - ] - return outputs + return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)] def apply_logits_processors( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index beef5203e039..f3b34544f8d9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -484,7 +484,7 @@ def propose( def prepare_next_token_ids_cpu( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, num_scheduled_tokens: dict[str, int], @@ -499,7 +499,7 @@ def prepare_next_token_ids_cpu( req_ids = gpu_input_batch.req_ids next_token_ids: list[int] = [] for i, token_ids in enumerate(sampled_token_ids): - if token_ids: + if token_ids.shape[0] > 0: # Common case. next_token_id = token_ids[-1] else: @@ -510,10 +510,9 @@ def prepare_next_token_ids_cpu( seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id] next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - next_token_ids = torch.tensor( + return torch.tensor( next_token_ids, dtype=torch.int32, device=self.input_ids.device ) - return next_token_ids def prepare_next_token_ids_padded( self, diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index e2f83cb24aa9..378937dba988 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -54,7 +54,7 @@ def __init__(self, vllm_config: VllmConfig): # Trigger Numba JIT compilation for N-gram proposer. # This usually takes less than 1 second. self.propose( - [[]] * 1024, + [np.array([])] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32), np.zeros((1024, self.max_model_len), dtype=np.int32), @@ -131,7 +131,7 @@ def batch_propose( def propose( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], req_ids: list[str], num_tokens_no_spec: np.ndarray, token_ids_cpu: np.ndarray, @@ -140,7 +140,7 @@ def propose( # find which requests need ngram proposals valid_ngram_requests = [] for i, sampled_ids in enumerate(sampled_token_ids): - num_sampled_ids = len(sampled_ids) + num_sampled_ids = sampled_ids.shape[0] if not num_sampled_ids: # Skip speculative decoding. continue diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py index 049e335db325..d76e0ffe778d 100644 --- a/vllm/v1/spec_decode/suffix_decoding.py +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import numpy as np + from vllm.config import VllmConfig from vllm.v1.worker.gpu_input_batch import InputBatch @@ -32,16 +34,16 @@ def __init__(self, vllm_config: VllmConfig): def propose( self, input_batch: InputBatch, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], ) -> list[list[int]]: """ Propose speculative tokens for each request in the input batch. Suffix Decoding will speculate a dynamic number of tokens for each request every decoding step, so each entry in the returned list may have different lengths. 
""" - draft_token_ids: list[list[int]] = [] + draft_token_ids: list[np.ndarray] = [] for i, sampled_ids in enumerate(sampled_token_ids): - if not sampled_ids: + if sampled_ids.shape[0] == 0: # Skip speculative decoding for partial prefills. draft_token_ids.append([]) continue @@ -70,7 +72,7 @@ def propose( self.suffix_cache.start_request(req_id, prompt_token_ids) # Append the newly sampled ids to the suffix cache for this request. - self.suffix_cache.add_active_response(req_id, sampled_ids) + self.suffix_cache.add_active_response(req_id, sampled_ids.tolist()) # Suffix decoding only uses the most recent tokens up to max_tree_depth, so # we extract the pattern from the end of the input. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9b3e5b668aab..d0d6164180e6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -216,9 +216,11 @@ def get_output(self) -> ModelRunnerOutput: del self._logprobs_tensors del self._sampled_token_ids - valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in self.sampled_token_ids_cpu.numpy() + ] for i in self._invalid_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[i] = np.array([]) output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids @@ -2339,7 +2341,7 @@ def _bookkeeping_sync( ) -> tuple[ dict[str, int], LogprobsLists | None, - list[list[int]], + list[np.ndarray], dict[str, LogprobsTensors | None], list[str], dict[str, int], @@ -2365,6 +2367,7 @@ def _bookkeeping_sync( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids invalid_req_indices = [] + valid_sampled_token_ids: list[np.ndarray] if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2379,7 +2382,7 @@ def _bookkeeping_sync( ) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)].clear() + valid_sampled_token_ids[int(i)] = np.array([]) else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2407,19 +2410,24 @@ def _bookkeeping_sync( [0] if spec_decode_metadata and logprobs_tensors else None ) for req_idx in range(num_sampled_tokens): + sampled_ids: np.ndarray | None if self.use_async_scheduling: - sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None + sampled_ids = ( + np.array([-1]) if req_idx not in invalid_req_indices_set else None + ) else: sampled_ids = valid_sampled_token_ids[req_idx] - num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 + num_sampled_ids: int = ( + sampled_ids.shape[0] if sampled_ids is not None else 0 + ) if cu_num_accepted_tokens is not None: cu_num_accepted_tokens.append( cu_num_accepted_tokens[-1] + num_sampled_ids ) - if not sampled_ids: + if sampled_ids is None or num_sampled_ids == 0: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] @@ -2761,7 +2769,9 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) - def propose_draft_token_ids(sampled_token_ids): + def propose_draft_token_ids( + sampled_token_ids: torch.Tensor | list[np.ndarray], + ) -> None: assert spec_decode_common_attn_metadata is not None with record_function_or_nullcontext("gpu_model_runner: draft"): self._draft_token_ids = self.propose_draft_token_ids( @@ -2883,14 +2893,14 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: torch.Tensor | list[list[int]], + sampled_token_ids: torch.Tensor | list[np.ndarray], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, aux_hidden_states: list[torch.Tensor] | None, spec_decode_metadata: SpecDecodeMetadata | None, common_attn_metadata: CommonAttentionMetadata, - ) -> list[list[int]] | torch.Tensor: + ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": assert isinstance(sampled_token_ids, list) @@ -2922,7 +2932,7 @@ def propose_draft_token_ids( for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids ): - indices.append(offset + len(tokens) - 1) + indices.append(offset + tokens.shape[0] - 1) offset += num_draft + 1 indices = torch.tensor(indices, device=self.device) hidden_states = sample_hidden_states[indices] @@ -4862,7 +4872,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: # This is a short term mitigation for issue mentioned in # https://github.com/vllm-project/vllm/issues/22754. 
# `tolist` would trigger a cuda wise stream sync, which @@ -4875,4 +4885,4 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: pinned.copy_(sampled_token_ids, non_blocking=True) self.transfer_event.record() self.transfer_event.synchronize() - return pinned.tolist() + return [row for row in pinned.numpy()] diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 01490e0dfac9..e9eb7cad38f8 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1254,13 +1254,15 @@ def concat_lists(input_lists): max_gen_len = selected_token_ids.shape[-1] if max_gen_len == 1: - valid_sampled_token_ids = selected_token_ids.tolist() + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in selected_token_ids.numpy() + ] # Mask out the sampled tokens that should not be sampled. # TODO: Keep in sync with gpu_model_runner.py, in particular # the "else" case here for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[i] = np.array([]) # Append sampled tokens for i, req_state, seq_len in request_seq_lens: @@ -1273,7 +1275,7 @@ def concat_lists(input_lists): valid_mask = selected_token_ids != INVALID_TOKEN_ID gen_lens = valid_mask.sum(dim=1).tolist() valid_sampled_token_ids = [ - seq.tolist() for seq in selected_token_ids[valid_mask].split(gen_lens) + seq.numpy() for seq in selected_token_ids[valid_mask].split(gen_lens) ] self.input_batch.num_tokens[:num_reqs] += gen_lens for i, req_state, seq_len in request_seq_lens: From dd6ac1c2bb3d29f8ba612a2f66f350a2c55c7e8b Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Fri, 14 Nov 2025 23:59:42 -0800 Subject: [PATCH 093/578] [RL] [V1] Remove unused device argument from reset_kv_cache (#28766) Signed-off-by: Zhuohan Li --- vllm/engine/protocol.py | 2 +- vllm/entrypoints/llm.py | 5 ++--- vllm/entrypoints/openai/api_server.py | 10 +++------- vllm/v1/engine/async_llm.py | 6 ++---- vllm/v1/engine/llm_engine.py | 3 +-- 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 24fcd9fe1cab..462d2c4e50e7 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -125,7 +125,7 @@ async def reset_mm_cache(self) -> None: ... @abstractmethod - async def reset_prefix_cache(self, device: Device | None = None) -> None: + async def reset_prefix_cache(self) -> None: """Reset the prefix cache""" ... 
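# A minimal sketch of the pinned-host copy pattern used by _to_list() in
# gpu_model_runner.py above: copy the sampled ids into a pinned CPU buffer
# asynchronously, wait for the transfer, and hand out numpy row views instead
# of calling tolist(). The shapes and the CPU stand-in tensor below are
# assumed for the example; the real code waits on a CUDA event.
import torch

sampled = torch.arange(6, dtype=torch.int64).reshape(3, 2)  # stand-in for sampled ids
pin = torch.cuda.is_available()
pinned = torch.empty(sampled.shape, dtype=sampled.dtype, pin_memory=pin)
pinned.copy_(sampled, non_blocking=pin)
if pin:
    torch.cuda.synchronize()  # ensure the async copy finished before reading
rows = [row for row in pinned.numpy()]  # one np.ndarray per request, no tolist()
assert rows[0].tolist() == [0, 1]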
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62717a7eacdf..b0786bd355aa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -32,7 +32,6 @@ TokenizerMode, ) from vllm.engine.arg_utils import EngineArgs -from vllm.engine.protocol import Device from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ChatTemplateContentFormatOption, @@ -1499,8 +1498,8 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.llm_engine.stop_profile() - def reset_prefix_cache(self, device: Device | None = None) -> None: - self.llm_engine.reset_prefix_cache(device) + def reset_prefix_cache(self) -> None: + self.llm_engine.reset_prefix_cache() def sleep(self, level: int = 1): """ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3e59af717d95..3cf66fcd27e2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -39,7 +39,7 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import Device, EngineClient +from vllm.engine.protocol import EngineClient from vllm.entrypoints.anthropic.protocol import ( AnthropicError, AnthropicErrorResponse, @@ -1069,12 +1069,8 @@ async def reset_prefix_cache(raw_request: Request): Reset the prefix cache. Note that we currently do not check if the prefix cache is successfully reset in the API server. """ - device = None - device_str = raw_request.query_params.get("device") - if device_str is not None: - device = Device[device_str.upper()] - logger.info("Resetting prefix cache with specific %s...", str(device)) - await engine_client(raw_request).reset_prefix_cache(device) + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() return Response(status_code=200) @router.post("/reset_mm_cache") diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 48ea6ef8515c..c160c7cbcab4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -14,7 +14,7 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import Device, EngineClient +from vllm.engine.protocol import EngineClient from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType from vllm.logger import init_logger @@ -672,9 +672,7 @@ async def reset_mm_cache(self) -> None: self.processor.clear_mm_cache() await self.engine_core.reset_mm_cache_async() - async def reset_prefix_cache(self, device: Device | None = None) -> None: - if device == Device.CPU: - raise ValueError("Not supported on CPU.") + async def reset_prefix_cache(self) -> None: await self.engine_core.reset_prefix_cache_async() async def sleep(self, level: int = 1) -> None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1db83446ba0b..e403cea87788 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -14,7 +14,6 @@ from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.distributed.parallel_state import get_dp_group from vllm.engine.arg_utils import EngineArgs -from vllm.engine.protocol import Device from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -321,7 +320,7 @@ def reset_mm_cache(self): self.processor.clear_mm_cache() self.engine_core.reset_mm_cache() - def 
reset_prefix_cache(self, device: Device | None = None): + def reset_prefix_cache(self): self.engine_core.reset_prefix_cache() def sleep(self, level: int = 1): From 74b5267d3a2d49be548e488650d1504be0b3e3fe Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Sat, 15 Nov 2025 04:10:15 -0500 Subject: [PATCH 094/578] Use narrow over indexing in `hadacore_transform` to prep for ABI stable (#28756) Signed-off-by: Jane Xu --- csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu index 5369d409f9b2..aff11326d78e 100644 --- a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu +++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu @@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) { }); if (numel % 256 != 0) { - out = out.index({torch::indexing::Slice(0, numel / had_size)}); + out = out.narrow(0, 0, numel / had_size); } if (inplace && out.data_ptr() != x.data_ptr()) { From 1ec978c209391286d4cee968426900e9a4d256a5 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Sat, 15 Nov 2025 01:10:48 -0800 Subject: [PATCH 095/578] [Kernel][Moe Configs] llama4 maverick fp8 moe config tp8 on mi325 (#28709) Signed-off-by: Zhewen Li --- ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 000000000000..555d17364452 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} From 638e4196d15f14a5fe68a64000801abda6c2ef8f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 17:59:31 +0800 Subject: [PATCH 096/578] [Misc] Make `SchedulerConfig.max_model_len` init-only (#28733) Signed-off-by: DarkLight1337 --- tests/kernels/moe/test_batched_moe.py | 2 -- tests/kernels/moe/test_block_fp8.py | 2 -- tests/kernels/moe/test_block_int8.py | 2 -- tests/kernels/moe/test_cutlass_moe.py | 2 -- tests/kernels/moe/test_flashinfer.py | 2 -- tests/kernels/moe/test_moe.py | 2 -- tests/kernels/moe/test_pplx_cutlass_moe.py | 2 -- tests/kernels/moe/test_pplx_moe.py | 2 -- tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 2 -- tests/kernels/quantization/test_block_fp8.py | 2 -- tests/kernels/quantization/test_block_int8.py | 2 -- vllm/config/scheduler.py | 36 +++++++++---------- vllm/config/vllm.py | 1 - vllm/platforms/cpu.py | 2 +- vllm/platforms/tpu.py | 2 +- vllm/platforms/xpu.py | 2 +- vllm/v1/core/sched/scheduler.py | 2 +- 17 files changed, 22 insertions(+), 45 deletions(-) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 62704bbcbbc7..2285709fa7d6 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -40,8 +40,6 @@ TOP_KS = [1, 2, 6] vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 @dataclass diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index cd34617ee0fc..88db4b3e537c 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -33,8 +33,6 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 3799e60f1294..e35ca4caa9db 100644 --- 
a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -18,8 +18,6 @@ pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 DTYPES = [torch.bfloat16] diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 5512ccce47b0..c15837f14570 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -42,8 +42,6 @@ ] vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 @dataclasses.dataclass diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 707068b2bbdc..3a681d4603f8 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -45,8 +45,6 @@ ] vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def quant_fp8_per_tensor_batches(a): diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index c27cf2468ede..0550c2d9e212 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -81,8 +81,6 @@ ] vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def run_moe_test( diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index a2de64974b35..dd4eb4da913b 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -192,8 +192,6 @@ def pplx_cutlass_moe( vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def _pplx_moe( diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 0f0ed3326d15..f671b23d300c 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -81,8 +81,6 @@ DTYPES = [torch.float8_e4m3fn, torch.bfloat16] vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def torch_prepare( diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 933cd9dbdeaa..7a467e160b78 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -18,8 +18,6 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 55f092e7ea69..e9973c1fcc15 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -29,8 +29,6 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] diff --git 
a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index dabc10a122f7..310091b6a554 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -18,8 +18,6 @@ pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 DTYPES = [torch.half, torch.bfloat16] M = [1, 33, 64, 222] diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 444568994a95..8194295ffedb 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -6,7 +6,7 @@ from dataclasses import InitVar from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast -from pydantic import Field, field_validator, model_validator +from pydantic import Field, field_validator from pydantic.dataclasses import dataclass from typing_extensions import Self, deprecated @@ -48,13 +48,6 @@ class SchedulerConfig: In real usage, this should be set in `EngineArgs.create_engine_config`. """ - max_model_len: int = Field(default=8192, ge=1) - """Maximum length of a sequence (including prompt and generated text). - - The default value here is mainly for convenience when testing. - In real usage, this should duplicate `ModelConfig.max_model_len` via - `EngineArgs`.""" - max_num_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.""" @@ -89,6 +82,12 @@ class SchedulerConfig: is_multimodal_model: bool = False """True if the model is multimodal.""" + max_model_len: InitVar[int] = 8192 + """Maximum length of a sequence (including prompt and generated text). + + Note: This is stored in the ModelConfig, and is used only here to + provide fallbacks and validate other attributes.""" + is_encoder_decoder: InitVar[bool] = False """True if the model is an encoder-decoder model. @@ -199,7 +198,7 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: return value return handler(value) - def __post_init__(self, is_encoder_decoder: bool) -> None: + def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None: if is_encoder_decoder: # Chunked prefill should be disabled for encoder-decoder models. self.disable_chunked_mm_input = True @@ -221,7 +220,7 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: if self.max_num_partial_prefills > 1: if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * 0.04) + self.long_prefill_token_threshold = int(max_model_len * 0.04) logger.info( "Concurrent partial prefills enabled with " @@ -232,6 +231,8 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: self.long_prefill_token_threshold, ) + self.verify_max_model_len(max_model_len) + @property @deprecated( "`SchedulerConfig.chunked_prefill_enabled` has been renamed to " @@ -245,15 +246,14 @@ def chunked_prefill_enabled(self) -> bool: def chunked_prefill_enabled(self, value: bool): self.enable_chunked_prefill = value - @model_validator(mode="after") - def _verify_args(self) -> Self: + def verify_max_model_len(self, max_model_len: int) -> Self: if ( - self.max_num_batched_tokens < self.max_model_len + self.max_num_batched_tokens < max_model_len and not self.enable_chunked_prefill ): raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " - f"smaller than max_model_len ({self.max_model_len}). 
" + f"smaller than max_model_len ({max_model_len}). " "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " @@ -267,12 +267,12 @@ def _verify_args(self) -> Self: f"({self.max_num_seqs})." ) - if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: + if self.max_num_batched_tokens > self.max_num_seqs * max_model_len: logger.warning( "max_num_batched_tokens (%d) exceeds max_num_seqs " "* max_model_len (%d). This may lead to unexpected behavior.", self.max_num_batched_tokens, - self.max_num_seqs * self.max_model_len, + self.max_num_seqs * max_model_len, ) if self.max_num_partial_prefills > 1: @@ -282,11 +282,11 @@ def _verify_args(self) -> Self: "max_num_partial_prefills > 1." ) - if self.long_prefill_token_threshold > self.max_model_len: + if self.long_prefill_token_threshold > max_model_len: raise ValueError( "long_prefill_token_threshold " f"({self.long_prefill_token_threshold}) cannot be greater " - f"than the max_model_len ({self.max_model_len})." + f"than the max_model_len ({max_model_len})." ) if self.max_long_partial_prefills > self.max_num_partial_prefills: diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 1e6e455210c8..bf9bcd0e8a11 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -929,7 +929,6 @@ def recalculate_max_model_len(self, max_model_len: int): model_config = self.model_config max_model_len = model_config.get_and_verify_max_len(max_model_len) self.model_config.max_model_len = max_model_len - self.scheduler_config.max_model_len = max_model_len def try_verify_and_update_config(self): if self.model_config is None: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 1da34629472c..ed655912d396 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -339,7 +339,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) vllm_config.scheduler_config.enable_chunked_prefill = False vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, + vllm_config.model_config.max_model_len, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c1218801bc07..944344a22957 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -191,7 +191,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) vllm_config.scheduler_config.enable_chunked_prefill = False vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, + vllm_config.model_config.max_model_len, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index ad4beb28bdae..65516827a16d 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -185,7 +185,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) vllm_config.scheduler_config.enable_chunked_prefill = False vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, + vllm_config.model_config.max_model_len, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c640c40a455d..bc15979dea62 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -83,7 +83,7 @@ def __init__( # Scheduling constraints. 
self.max_num_running_reqs = self.scheduler_config.max_num_seqs self.max_num_scheduled_tokens = self.scheduler_config.max_num_batched_tokens - self.max_model_len = self.scheduler_config.max_model_len + self.max_model_len = vllm_config.model_config.max_model_len self.enable_kv_cache_events = ( self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events From 173b356abff3e2e547fc44c60361f3b0adc41aaf Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Sat, 15 Nov 2025 14:13:41 +0400 Subject: [PATCH 097/578] [PERF] Remove TRTLLM Gen attn kernel limitation `max_seq_len <=131072` (#28755) Signed-off-by: Vadim Gimpelson --- vllm/config/vllm.py | 15 --------------- vllm/utils/flashinfer.py | 6 ++---- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index bf9bcd0e8a11..87f6b6eed851 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -483,21 +483,6 @@ def __post_init__(self): "Overriding cudagraph_mode to PIECEWISE." ) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - elif ( - current_platform.is_cuda() - and current_platform.is_device_capability(100) - and self.model_config.max_model_len > 131072 - and not self.model_config.use_mla - ): - # Refer to vllm/utils/flashinfer.py::use_trtllm_attention() - logger.warning_once( - "NVIDIA Blackwell TRTLLM attention cannot support " - "max_model_len >= 131072 (found " - f"{self.model_config.max_model_len}), causing dynamic " - "dispatching that breaks full cudagraphs. " - "Overriding cudagraph_mode to PIECEWISE." - ) - self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE # disable cudagraph when enforce eager execution if self.model_config is not None and self.model_config.enforce_eager: diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 79e5a4c30259..1209d64901bf 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -319,14 +319,12 @@ def use_trtllm_attention( # Environment variable not set - use auto-detection if is_prefill: # Prefill auto-detection - use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto" + use_trtllm = kv_cache_dtype == "auto" if use_trtllm: logger.warning_once("Using TRTLLM prefill attention (auto-detected).") else: # Decode auto-detection - use_trtllm = ( - num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto" - ) + use_trtllm = num_tokens <= 256 and kv_cache_dtype == "auto" if use_trtllm: logger.warning_once("Using TRTLLM decode attention (auto-detected).") return use_trtllm From f36292dbee27a5ebe0e7115c061b82f6f5372dcf Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Sat, 15 Nov 2025 03:46:12 -0800 Subject: [PATCH 098/578] [compile] Enable sequence parallelism matching w/o custom ops enabled (#27126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: angelayi Signed-off-by: Luka Govedič Signed-off-by: ProExpertProg Co-authored-by: Luka Govedič Co-authored-by: Luka Govedič Co-authored-by: Luka Govedič --- .buildkite/test-pipeline.yaml | 14 +- tests/compile/test_fusions_e2e.py | 228 ++++++++++-- tests/compile/test_sequence_parallelism.py | 262 +++++++------- tests/distributed/test_sequence_parallel.py | 15 +- vllm/compilation/sequence_parallelism.py | 369 ++++++-------------- vllm/config/vllm.py | 28 +- 6 files changed, 472 insertions(+), 444 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 
52539728215b..723f311a2646 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -478,10 +478,11 @@ steps: - vllm/ - tests/compile commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -925,7 +926,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion & Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -946,7 +947,9 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -969,8 +972,6 @@ steps: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1266,7 +1267,8 @@ steps: - pytest -v -s tests/compile/test_async_tp.py - pytest -v -s tests/compile/test_sequence_parallelism.py - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py index e1560efb3f24..f22d60ef000b 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/test_fusions_e2e.py @@ -20,13 +20,22 @@ from ..utils import flat_product, multi_gpu_test +is_blackwell = lambda: current_platform.is_device_capability(100) +"""Are we running on Blackwell, a lot of tests depend on it""" + + +class Matches(NamedTuple): + attention_fusion: int = 0 + allreduce_fusion: int = 0 + sequence_parallel: int = 0 + async_tp: int = 0 + class ModelBackendTestCase(NamedTuple): model_name: str model_kwargs: dict[str, Any] backend: AttentionBackendEnum - attention_fusions: int - allreduce_fusions: int | None = None + matches: Matches MODELS_FP8: list[ModelBackendTestCase] = [] @@ 
-38,17 +47,33 @@ class ModelBackendTestCase(NamedTuple): ModelBackendTestCase( # Use smaller model for L40s in CI model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=32, - allreduce_fusions=65, + # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell + # so FI attention+fp8_quant is at least tested once + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + backend=AttentionBackendEnum.FLASHINFER + if is_blackwell() + else AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=32, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), ), ModelBackendTestCase( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER, - attention_fusions=48, - allreduce_fusions=96, + # TODO FlashInfer attn broken on Hopper with kvcache=fp8: + # https://github.com/vllm-project/vllm/issues/28568 + # TODO FlashInfer attn broken on Blackwell for llama4: + # https://github.com/vllm-project/vllm/issues/28604 + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=48, + allreduce_fusion=96, + sequence_parallel=96, + async_tp=95, # mlp is moe, no fusion there + ), ), ] @@ -57,8 +82,12 @@ class ModelBackendTestCase(NamedTuple): model_name="nvidia/Llama-3.1-8B-Instruct-FP4", model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), backend=AttentionBackendEnum.FLASHINFER, - attention_fusions=32, - allreduce_fusions=65, + matches=Matches( + attention_fusion=32, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), ), ] @@ -68,15 +97,23 @@ class ModelBackendTestCase(NamedTuple): model_name="meta-llama/Llama-3.1-8B-Instruct", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=0, - allreduce_fusions=65, + matches=Matches( + attention_fusion=0, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), ), ModelBackendTestCase( model_name="Qwen/Qwen3-30B-A3B", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=0, - allreduce_fusions=97, + matches=Matches( + attention_fusion=0, + allreduce_fusion=97, + sequence_parallel=97, + async_tp=96, # MLP is MoE, half the fusions of dense + ), ), ] @@ -86,19 +123,19 @@ class ModelBackendTestCase(NamedTuple): model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=32, + matches=Matches(attention_fusion=32), ), ModelBackendTestCase( model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.ROCM_ATTN, - attention_fusions=32, + matches=Matches(attention_fusion=32), ), ModelBackendTestCase( model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, - attention_fusions=32, + matches=Matches(attention_fusion=32), ), ] @@ -106,8 +143,7 @@ class ModelBackendTestCase(NamedTuple): @pytest.mark.parametrize( - "model_name, model_kwargs, backend, " - "attention_fusions, allreduce_fusions, custom_ops", + "model_name, model_kwargs, backend, matches, custom_ops", # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8)) # quant_fp4 only has the custom impl @@ -118,15 +154,14 @@ def test_attn_quant( 
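For readers skimming the new test parametrization: `Matches` is a `NamedTuple` with per-pass defaults, and the tests rely on `_replace` to adjust a single expectation (for example, zeroing `attention_fusion` when attention+quant fusion cannot apply). A small self-contained sketch of that idiom, mirroring the class added above:

    from typing import NamedTuple

    class Matches(NamedTuple):
        attention_fusion: int = 0
        allreduce_fusion: int = 0
        sequence_parallel: int = 0
        async_tp: int = 0

    expected = Matches(attention_fusion=32, allreduce_fusion=65)
    # NamedTuples are immutable; _replace returns an updated copy rather than
    # mutating in place, so the original parametrized value stays untouched.
    expected = expected._replace(attention_fusion=0)
    print(expected)
    # Matches(attention_fusion=0, allreduce_fusion=65, sequence_parallel=0, async_tp=0)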
model_name: str, model_kwargs: dict[str, Any], backend: AttentionBackendEnum, - attention_fusions: int, - allreduce_fusions: int, + matches: Matches, custom_ops: str, inductor_graph_partition: bool, caplog_mp_spawn, monkeypatch, ): if backend == AttentionBackendEnum.FLASHINFER and ( - not current_platform.is_device_capability((10, 0)) or not has_flashinfer() + not is_blackwell() or not has_flashinfer() ): pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer") if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): @@ -169,12 +204,12 @@ def test_attn_quant( with caplog_mp_spawn(logging.DEBUG) as log_holder: run_model(compilation_config, model_name, **model_kwargs) - matches = re.findall( + log_matches = re.findall( r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", log_holder.text, ) - assert len(matches) == 1, log_holder.text - assert int(matches[0]) == attention_fusions + assert len(log_matches) == 1, log_holder.text + assert int(log_matches[0]) == matches.attention_fusion CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"] @@ -187,8 +222,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]: @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( - "model_name, model_kwargs, backend, " - "attention_fusions, allreduce_fusions, custom_ops", + "model_name, model_kwargs, backend, matches, custom_ops", # Toggle RMSNorm and QuantFP8 for FP8 models list( flat_product( @@ -209,8 +243,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm( model_name: str, model_kwargs: dict, backend: AttentionBackendEnum, - attention_fusions: int, - allreduce_fusions: int, + matches: Matches, custom_ops: str, inductor_graph_partition: bool, caplog_mp_spawn, @@ -219,6 +252,13 @@ def test_tp2_attn_quant_allreduce_rmsnorm( if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("Inductor graph partition requires torch>=2.9") + if "fp4" in model_name.lower() and not is_blackwell(): + pytest.skip("NVFP4 quant requires Blackwell") + + if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell(): + # FlashInfer attn fusion requires Blackwell + matches = matches._replace(attention_fusion=0) + custom_ops_list = custom_ops.split(",") if custom_ops else [] if inductor_graph_partition: @@ -258,23 +298,135 @@ def test_tp2_attn_quant_allreduce_rmsnorm( run_model( compilation_config, model_name, tensor_parallel_size=2, **model_kwargs ) - matches = re.findall( + log_matches = re.findall( + r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", + log_holder.text, + ) + assert len(log_matches) == 2, log_holder.text + + assert int(log_matches[0]) == matches.attention_fusion + assert int(log_matches[1]) == matches.attention_fusion + + log_matches = re.findall( + r"collective_fusion.py:\d+] Replaced (\d+) patterns", + log_holder.text, + ) + assert len(log_matches) == 2, log_holder.text + + assert int(log_matches[0]) == matches.allreduce_fusion + assert int(log_matches[1]) == matches.allreduce_fusion + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, model_kwargs, backend, matches, custom_ops", + # Toggle RMSNorm and QuantFP8 for FP8 models + list( + flat_product( + MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM) + ) + ) + # Toggle RMSNorm for FP4 models and unquant models + + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)), +) +@pytest.mark.parametrize("inductor_graph_partition", [True, False]) +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="sequence parallel 
only tested on CUDA", +) +def test_tp2_attn_quant_async_tp( + model_name: str, + model_kwargs: dict, + backend: AttentionBackendEnum, + matches: Matches, + custom_ops: str, + inductor_graph_partition: bool, + caplog_mp_spawn, + monkeypatch, +): + if is_blackwell(): + # TODO: https://github.com/vllm-project/vllm/issues/27893 + pytest.skip("Blackwell is not supported for AsyncTP pass") + + if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition requires torch>=2.9") + + if "fp4" in model_name.lower() and not is_blackwell(): + pytest.skip("NVFP4 quant requires Blackwell") + + if backend == AttentionBackendEnum.FLASHINFER: + if not has_flashinfer(): + pytest.skip("FlashInfer backend requires flashinfer installed") + if not is_blackwell(): + # FlashInfer attn fusion requires Blackwell + matches = matches._replace(attention_fusion=0) + + custom_ops_list = custom_ops.split(",") if custom_ops else [] + + if inductor_graph_partition: + mode = CUDAGraphMode.FULL_AND_PIECEWISE + splitting_ops: list[str] | None = None + else: + mode = CUDAGraphMode.FULL_DECODE_ONLY + splitting_ops = [] + + # Disable, compile cache to make sure custom passes run. + # Otherwise, we can't verify fusion happened through the logs. + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + # To capture subprocess logs, we need to know whether spawn or fork is used. + # Force spawn as it is more general. + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + compilation_config = CompilationConfig( + # Testing properties + use_inductor_graph_partition=inductor_graph_partition, + cudagraph_mode=mode, + custom_ops=custom_ops_list, + splitting_ops=splitting_ops, + # Common + level=CompilationMode.VLLM_COMPILE, + pass_config=PassConfig( + enable_attn_fusion=True, + enable_noop=True, + enable_sequence_parallelism=True, + enable_async_tp=True, + ), + # Inductor caches custom passes by default as well via uuid + inductor_compile_config={"force_disable_caches": True}, + ) + + with caplog_mp_spawn(logging.DEBUG) as log_holder: + run_model( + compilation_config, model_name, tensor_parallel_size=2, **model_kwargs + ) + log_matches = re.findall( r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", log_holder.text, ) - assert len(matches) == 2, log_holder.text + assert len(log_matches) == 2, log_holder.text + + assert int(log_matches[0]) == matches.attention_fusion + assert int(log_matches[1]) == matches.attention_fusion + + log_matches = re.findall( + r"sequence_parallelism.py:\d+] Replaced (\d+) patterns", + log_holder.text, + ) + assert len(log_matches) == 2, log_holder.text - assert int(matches[0]) == attention_fusions - assert int(matches[1]) == attention_fusions + assert int(log_matches[0]) == matches.sequence_parallel + assert int(log_matches[1]) == matches.sequence_parallel - matches = re.findall( + log_matches = re.findall( r"collective_fusion.py:\d+] Replaced (\d+) patterns", log_holder.text, ) - assert len(matches) == 2, log_holder.text + assert len(log_matches) == 2, log_holder.text - assert int(matches[0]) == allreduce_fusions - assert int(matches[1]) == allreduce_fusions + assert int(log_matches[0]) == matches.async_tp + assert int(log_matches[1]) == matches.async_tp def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index e909cf7393ad..9cd7f64b04af 
100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -5,15 +5,15 @@ import torch import vllm.envs as envs -from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.compilation.fusion import RMSNormQuantFusionPass -from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func +from vllm.compilation.fx_utils import find_auto_fn from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.sequence_parallelism import SequenceParallelismPass from vllm.compilation.vllm_inductor_pass import VllmInductorPass from vllm.config import ( CompilationConfig, + CUDAGraphMode, DeviceConfig, ModelConfig, PassConfig, @@ -27,6 +27,7 @@ initialize_model_parallel, ) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables @@ -43,172 +44,157 @@ ] -class TestModel(torch.nn.Module): - def __init__(self, hidden_size=16, intermediate_size=32): +class TestAllReduceRMSNormModel(torch.nn.Module): + def __init__(self, hidden_size=16, eps=1e-6): super().__init__() self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.gate_proj = torch.nn.Parameter( - torch.empty((intermediate_size, hidden_size)) - ) - self.norm = RMSNorm(intermediate_size, 1e-05) - # Initialize weights - torch.nn.init.normal_(self.gate_proj, std=0.02) + self.eps = eps + self.norm = [RMSNorm(hidden_size, eps) for i in range(4)] + self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)] - def forward(self, hidden_states, residual): - """ - Forward pass implementing the operations in the FX graph + def forward(self, x): + z = torch.relu(x) + x = resid = tensor_model_parallel_all_reduce(z) + y = self.norm[0](x) - Args: - hidden_states: Input tensor - residual: Residual tensor from previous layer + z2 = torch.mm(y, self.w[0]) + x2 = tensor_model_parallel_all_reduce(z2) - Returns: - Tuple containing the output tensor - """ - # Reshape input - view = hidden_states.reshape(-1, self.hidden_size) + y2, resid = self.norm[1](x2, resid) - # matrix multiplication - permute = self.gate_proj.permute(1, 0) - mm = torch.mm(view, permute) + z3 = torch.mm(y2, self.w[1]) + x3 = tensor_model_parallel_all_reduce(z3) - # Tensor parallel all-reduce - all_reduce = tensor_model_parallel_all_reduce(mm) + y3, resid = self.norm[2](x3, resid) - # layer normalization - norm_output, residual_output = self.norm(all_reduce, residual) + z4 = torch.mm(y3, self.w[2]) + x4 = tensor_model_parallel_all_reduce(z4) - return norm_output, residual_output + y4, resid = self.norm[3](x4, resid) + return y4 def ops_in_model_before(self): return [torch.ops.vllm.all_reduce.default] def ops_in_model_after(self): return [ - torch.ops.vllm.reduce_scatter.default, torch.ops.vllm.all_gather.default, + torch.ops.vllm.reduce_scatter.default, ] def ops_in_model(self): - return [torch.ops._C.fused_add_rms_norm.default] + if RMSNorm.enabled(): + return [ + torch.ops._C.rms_norm.default, + torch.ops._C.fused_add_rms_norm.default, + ] + else: + return [] -class TestQuantModel(torch.nn.Module): - def __init__(self, hidden_size=16, intermediate_size=32): +class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module): + def 
__init__(self, hidden_size=16, eps=1e-6): super().__init__() - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size self.vllm_config = get_current_vllm_config() - self.gate_proj = torch.nn.Parameter( - torch.empty((intermediate_size, hidden_size)), requires_grad=False + self.hidden_size = hidden_size + self.eps = eps + self.norm = [RMSNorm(hidden_size, eps) for i in range(4)] + self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] + self.w = [ + torch.rand(hidden_size, hidden_size) + .to(dtype=current_platform.fp8_dtype()) + .t() + for _ in range(3) + ] + + self.fp8_linear = Fp8LinearOp( + act_quant_static=True, + act_quant_group_shape=GroupShape.PER_TENSOR, ) - self.norm = RMSNorm(intermediate_size, 1e-05) - # Initialize weights - torch.nn.init.normal_(self.gate_proj, std=0.02) - - self.fp8_linear = Fp8LinearOp(act_quant_static=True) - - self.scale = torch.rand(1, dtype=torch.float32) - # Create a weight that is compatible with torch._scaled_mm, - # which expects a column-major layout. - self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t() - self.wscale = torch.rand(1, dtype=torch.float32) - - def forward(self, hidden_states, residual): - """ - Forward pass implementing the operations in the FX graph - - Args: - hidden_states: Input tensor - residual: Residual tensor from previous layer - - Returns: - Tuple containing the output tensor - """ - # Reshape input - view = hidden_states.reshape(-1, self.hidden_size) - - # matrix multiplication - permute = self.gate_proj.permute(1, 0) - mm = torch.mm(view, permute) - - # Tensor parallel all-reduce - all_reduce = tensor_model_parallel_all_reduce(mm) - - # layer normalization - norm_output, residual_output = self.norm(all_reduce, residual) - - # scaled_mm with static input quantization - fp8_linear_result = self.fp8_linear.apply( - norm_output, - self.w, - self.wscale, - input_scale=self.scale.to(norm_output.device), + + self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] + + def forward(self, hidden_states): + # avoid having graph input be an arg to a pattern directly + z = torch.relu(hidden_states) + x = resid = tensor_model_parallel_all_reduce(z) + y = self.norm[0](x) + + z2 = self.fp8_linear.apply( + y, self.w[0], self.wscale[0], input_scale=self.scale[0] ) - return fp8_linear_result, residual_output + x2 = tensor_model_parallel_all_reduce(z2) + y2, resid = self.norm[1](x2, resid) - def ops_in_model_before(self): - ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP - # The following are only removed if fusion happens - if ( - self.vllm_config - and self.vllm_config.compilation_config.pass_config.enable_fusion - ): - ops_to_remove.extend( - [ - torch.ops._C.fused_add_rms_norm.default, - torch.ops._C.static_scaled_fp8_quant.default, - ] - ) - return ops_to_remove + z3 = self.fp8_linear.apply( + y2, self.w[1], self.wscale[1], input_scale=self.scale[1] + ) + + x3 = tensor_model_parallel_all_reduce(z3) + y3, resid = self.norm[2](x3, resid) # use resid here + + z4 = self.fp8_linear.apply( + y3, self.w[2], self.wscale[2], input_scale=self.scale[2] + ) + x4 = tensor_model_parallel_all_reduce(z4) + y4, resid = self.norm[3](x4, resid) # use resid here + return y4 def ops_in_model_after(self): - ops_to_add = [ - torch.ops.vllm.reduce_scatter.default, + return [ torch.ops.vllm.all_gather.default, + torch.ops.vllm.reduce_scatter.default, + ] + + def ops_in_model_before(self): + return [ + torch.ops.vllm.all_reduce.default, ] - # The following is only added 
if fusion happens - if ( - self.vllm_config - and self.vllm_config.compilation_config.pass_config.enable_fusion - ): - ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default) - return ops_to_add def ops_in_model(self): - if ( - self.vllm_config - and self.vllm_config.compilation_config.pass_config.enable_fusion - ): - # If fusion happens, the fused op is the one - # we check for (de)functionalization + if self.vllm_config.compilation_config.pass_config.enable_fusion: return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] - else: - # If no fusion, the original ops are checked + elif RMSNorm.enabled(): return [ torch.ops._C.fused_add_rms_norm.default, - # TODO functionalization pass does not handle this yet - # torch.ops._C.static_scaled_fp8_quant.default, ] + elif self.fp8_linear.quant_fp8.enabled(): + return [ + torch.ops._C.static_scaled_fp8_quant.default, + ] + else: + return [] @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel]) +@pytest.mark.parametrize( + "test_model_cls, custom_ops", + [ + (TestAllReduceRMSNormModel, "+rms_norm"), + (TestAllReduceRMSNormModel, "-rms_norm"), + (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"), + (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"), + (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"), + (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"), + ], +) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [16]) @pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("enable_fusion", [True, False]) +@pytest.mark.parametrize("dynamic", [False, True]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_sequence_parallelism_pass( test_model_cls: type[torch.nn.Module], + custom_ops: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype, enable_fusion: bool, + dynamic: bool, ): num_processes = 2 @@ -220,11 +206,13 @@ def run_torch_spawn(fn, nprocs): args=( num_processes, test_model_cls, + custom_ops, batch_size, seq_len, hidden_size, dtype, enable_fusion, + dynamic, ), nprocs=nprocs, ) @@ -236,11 +224,13 @@ def sequence_parallelism_pass_on_test_model( local_rank: int, world_size: int, test_model_cls: type[torch.nn.Module], + custom_ops: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype, enable_fusion: bool, + dynamic: bool, ): current_platform.seed_everything(0) @@ -264,12 +254,16 @@ def sequence_parallelism_pass_on_test_model( initialize_model_parallel(tensor_model_parallel_size=world_size) # configure vllm config for SequenceParallelismPass + custom_ops_list = custom_ops.split(",") if custom_ops else [] compilation_config = CompilationConfig( + splitting_ops=[], # avoid automatic rms_norm enablement + cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings + custom_ops=custom_ops_list, pass_config=PassConfig( enable_sequence_parallelism=True, enable_fusion=enable_fusion, enable_noop=True, - ) + ), ) # NoOp needed for fusion device_config = DeviceConfig(device=torch.device("cuda")) @@ -289,7 +283,6 @@ def sequence_parallelism_pass_on_test_model( with set_current_vllm_config(vllm_config): noop_pass = NoOpEliminationPass(vllm_config) sequence_parallelism_pass = SequenceParallelismPass(vllm_config) - func_pass = FixFunctionalizationPass(vllm_config) cleanup_pass = PostCleanupPass(vllm_config) assert ( 
sequence_parallelism_pass.compilation_config.splitting_ops @@ -310,38 +303,29 @@ def sequence_parallelism_pass_on_test_model( passes_for_backend.append(cleanup_pass) - backend_no_func = TestBackend(*passes_for_backend) - backend_func = TestBackend(*passes_for_backend, func_pass) + backend = TestBackend(*passes_for_backend) - model = test_model_cls(hidden_size, hidden_size * 2) + model = test_model_cls(hidden_size) hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) - residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) - compiled_model_no_func = torch.compile(model, backend=backend_no_func) - compiled_model_no_func(hidden_states, residual) - compiled_model_func = torch.compile(model, backend=backend_func) - compiled_model_func(hidden_states, residual) + if dynamic: + torch._dynamo.mark_dynamic(hidden_states, 0) + + compiled_model = torch.compile(model, backend=backend) + compiled_model(hidden_states) - assert sequence_parallelism_pass.matched_count == 1 + assert sequence_parallelism_pass.matched_count == 4 # In pre-nodes, all reduce should be there, # reduce scatter and all gather should not - backend_no_func.check_before_ops(model.ops_in_model_before()) + for op in model.ops_in_model_before(): + assert backend.op_count(op, before=True) == 4 # In post-nodes, reduce scatter and all gather should be there, # all reduce should not - backend_no_func.check_after_ops(model.ops_in_model_after()) + for op in model.ops_in_model_after(): + assert backend.op_count(op, before=False) == 4 - # check if the functionalization pass is applied for op in model.ops_in_model(): - find_auto_fn(backend_no_func.graph_post_pass.nodes, op) - assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None - - # make sure the ops were all de-functionalized - found = dict() - for node in backend_func.graph_post_pass.nodes: - for op in model.ops_in_model(): - if is_func(node, op): - found[op] = True - assert all(found[op] for op in model.ops_in_model()) + find_auto_fn(backend.graph_post_pass.nodes, op) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 94b2b51211a6..f38c509775ed 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -18,6 +18,7 @@ from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer from ..models.registry import HF_EXAMPLE_MODELS @@ -161,6 +162,7 @@ def _compare_sp( test_options: SPTestOptions, num_gpus_available: int, use_inductor_graph_partition: bool, + enable_async_tp: bool, *, method: Literal["generate", "encode"], is_multimodal: bool, @@ -244,10 +246,10 @@ def _compare_sp( compilation_config = { "mode": CompilationMode.VLLM_COMPILE, - "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { "enable_sequence_parallelism": True, + "enable_async_tp": enable_async_tp, "enable_fusion": enable_fusion, "enable_noop": True, }, @@ -307,6 +309,7 @@ def _compare_sp( ], ) @pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP @create_new_process_for_each_test() def test_tp_sp_generation( model_id: str, @@ -316,10 +319,19 @@ def test_tp_sp_generation( test_options: SPTestOptions, num_gpus_available, use_inductor_graph_partition: bool, + enable_async_tp: 
bool, ): if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + # Skip FP8 SP-only test on sm89 (compute capability 8.9) + if ( + "fp8" in model_id.lower() + and current_platform.get_device_capability() < (9, 0) + and (not enable_async_tp) + ): + pytest.skip("FP8 reduction support begins with sm90 capable devices.") + _compare_sp( model_id, parallel_setup, @@ -328,6 +340,7 @@ def test_tp_sp_generation( test_options, num_gpus_available, use_inductor_graph_partition, + enable_async_tp=enable_async_tp, method="generate", is_multimodal=False, ) diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 31624a8fdcc0..bb4dcf12d865 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools + import torch import torch._inductor.pattern_matcher as pm import torch.fx as fx @@ -10,98 +12,28 @@ from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + kFp8StaticTensorSym, +) from vllm.platforms import current_platform from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm +from .noop_elimination import NoOpEliminationPass from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) -class _RMSNormAndQuantOpHelper: - """Base helper for RMSNorm and RMSNorm + Quantization functionalization.""" +def get_first_out_wrapper(fn): + @functools.wraps(fn) + def wrapper(*args): + return fn(*args)[0] - def __init__( - self, - epsilon: float, - dtype: torch.dtype, - device: str, - quant_op: torch._ops.OpOverload | None = None, - **kwargs, - ): - self.epsilon = epsilon - self.dtype = dtype - self.device = device - self.quant_op = quant_op - - def _functional_rmsnorm(self, result_buffer, input_tensor, weight_tensor): - return torch.ops.higher_order.auto_functionalized( - torch.ops._C.rms_norm.default, - result=result_buffer, - input=input_tensor, - weight=weight_tensor, - epsilon=self.epsilon, - ) + return wrapper - def _functional_fused_add_rmsnorm( - self, input_tensor, residual_tensor, weight_tensor - ): - return torch.ops.higher_order.auto_functionalized( - torch.ops._C.fused_add_rms_norm.default, - input=input_tensor, - residual=residual_tensor, - weight=weight_tensor, - epsilon=self.epsilon, - ) - def _functional_rmsnorm_then_quant( - self, - rmsnorm_result_buffer, - quant_result_buffer, - input_tensor, - weight_tensor, - scale_tensor, - ): - if self.quant_op is None: - raise RuntimeError( - "_RMSNormAndQuantOpHelper was not initialized with a quant_op." 
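The `get_first_out_wrapper` helper introduced in this file simply adapts a multi-output pattern or replacement function so that only its first output is exposed to the pattern matcher. A toy usage (the `pattern` function below is illustrative, not one of the registered patterns):

    import functools

    def get_first_out_wrapper(fn):
        @functools.wraps(fn)
        def wrapper(*args):
            return fn(*args)[0]
        return wrapper

    def pattern(x, w):
        # returns (normalized, residual); only the first output should be matched
        return x + w, x

    only_first = get_first_out_wrapper(pattern)
    print(only_first(1, 2))  # 3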
- ) - rmsnorm_out_tuple = self._functional_rmsnorm( - rmsnorm_result_buffer, input_tensor, weight_tensor - ) - quant_out_tuple = torch.ops.higher_order.auto_functionalized( - self.quant_op, - result=quant_result_buffer, - input=rmsnorm_out_tuple[1], - scale=scale_tensor, - ) - return quant_out_tuple - - def _functional_fused_add_rmsnorm_then_quant( - self, - quant_result_buffer, - input_tensor, - residual_tensor, - weight_tensor, - scale_tensor, - ): - if self.quant_op is None: - raise RuntimeError( - "_RMSNormAndQuantOpHelper was not initialized with a quant_op." - ) - fused_add_rmsnorm_out_tuple = self._functional_fused_add_rmsnorm( - input_tensor, residual_tensor, weight_tensor - ) - quant_out_tuple = torch.ops.higher_order.auto_functionalized( - self.quant_op, - result=quant_result_buffer, - input=fused_add_rmsnorm_out_tuple[1], - scale=scale_tensor, - ) - return quant_out_tuple, fused_add_rmsnorm_out_tuple[2] - - -class _SequenceParallelPatternHelper(_RMSNormAndQuantOpHelper): +class _SequenceParallelPatternHelper: """Helper for sequence parallelism patterns.""" def __init__( @@ -109,10 +41,10 @@ def __init__( epsilon: float, dtype: torch.dtype, device: str, - quant_op: torch._ops.OpOverload | None = None, - **kwargs, ): - super().__init__(epsilon, dtype, device, quant_op=quant_op, **kwargs) + self.epsilon = epsilon + self.dtype = dtype + self.device = device self.tp_group = get_tp_group() self.tp_size = get_tensor_model_parallel_world_size() @@ -131,36 +63,34 @@ def _all_gather(self, x: torch.Tensor) -> torch.Tensor: class FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + def get_inputs(self): input = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) - permute = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) arg3_1 = torch.empty([4], device=self.device, dtype=self.dtype) - return [input, permute, arg3_1] + return [input, arg3_1] def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, - permute: torch.Tensor, arg3_1: torch.Tensor, ): all_reduce = self._all_reduce(input) - rmsnorm = self._functional_rmsnorm(permute, all_reduce, arg3_1) + rmsnorm = self.rmsnorm_matcher(all_reduce, arg3_1) - return rmsnorm[1], all_reduce + return rmsnorm, all_reduce def replacement( input: torch.Tensor, - permute: torch.Tensor, arg3_1: torch.Tensor, ): reduce_scatter = self._reduce_scatter(input) - rmsnorm_result = torch.empty_like(reduce_scatter) - rmsnorm = self._functional_rmsnorm(rmsnorm_result, reduce_scatter, arg3_1) - - all_gather = self._all_gather(rmsnorm[1]) - + rmsnorm = self.rmsnorm_matcher(reduce_scatter, arg3_1) + all_gather = self._all_gather(rmsnorm) return all_gather, reduce_scatter pm.register_replacement( @@ -169,6 +99,10 @@ def replacement( class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon) + def get_inputs(self): mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) @@ -188,67 +122,34 @@ def pattern( rms_norm_weights: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: all_reduce = self._all_reduce(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - all_reduce, residual, rms_norm_weights - ) - return rmsnorm[1], rmsnorm[2] + rmsnorm = 
self.rmsnorm_matcher(all_reduce, rms_norm_weights, residual) + return rmsnorm[0], rmsnorm[1] def replacement( residual: torch.Tensor, mm_1: torch.Tensor, rms_norm_weights: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: + # pattern matcher replaces from top-to-bottom, + # so residual is still the full size here. + # once the seqpar pattern with the previous rmsnorm is replaced reduce_scatter = self._reduce_scatter(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - reduce_scatter, residual, rms_norm_weights - ) - all_gather = self._all_gather(rmsnorm[1]) - return all_gather, rmsnorm[2] + residual = residual[0 : reduce_scatter.size(0), ...] + rmsnorm = self.rmsnorm_matcher(reduce_scatter, rms_norm_weights, residual) + all_gather = self._all_gather(rmsnorm[0]) + # shape of residual changes but that's fine, + # next node is already slicing it, now becomes a noop + return all_gather, rmsnorm[1] pm.register_replacement( pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass ) - - -class LastAllReduceRMSNormPattern(_SequenceParallelPatternHelper): - def get_inputs(self): - mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) - - residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) - rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) - - return [ - residual, - mm_1, - rms_norm_weights, - ] - - def register(self, pm_pass: PatternMatcherPass): - def pattern( - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - all_reduce = self._all_reduce(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - all_reduce, residual, rms_norm_weights - ) - return rmsnorm[1] - - def replacement( - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - reduce_scatter = self._reduce_scatter(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - reduce_scatter, residual, rms_norm_weights - ) - normalized = self._all_gather(rmsnorm[1]) - return normalized - pm.register_replacement( - pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + get_first_out_wrapper(pattern), + get_first_out_wrapper(replacement), + self.get_inputs(), + pm.fwd_only, + pm_pass, ) @@ -257,52 +158,41 @@ def replacement( class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): def __init__( - self, epsilon: float, dtype: torch.dtype, device: str, op: torch._ops.OpOverload + self, + epsilon: float, + dtype: torch.dtype, + device: str, ): - super().__init__(epsilon, dtype, device, quant_op=op) + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) def get_inputs(self): input = torch.zeros([1, 8, 4], device=self.device, dtype=self.dtype) - rmsnorm_result = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) - quant_result = torch.empty([1, 8, 4], device=self.device, dtype=FP8_DTYPE) weight = torch.empty([4], device=self.device, dtype=self.dtype) scale = torch.tensor(1.0, device=self.device, dtype=torch.float32) - return [input, rmsnorm_result, quant_result, weight, scale] + return [input, weight, scale] def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, - rmsnorm_result: torch.Tensor, - quant_result: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor, ): all_reduce = self._all_reduce(input) - static_fp8 = self._functional_rmsnorm_then_quant( - rmsnorm_result, quant_result, 
all_reduce, weight, scale - ) - return static_fp8[1], all_reduce + rms = self.rmsnorm_matcher(all_reduce, weight) + quant, _ = self.quant_matcher(rms, scale) + return quant, all_reduce def replacement( input: torch.Tensor, - rmsnorm_result: torch.Tensor, - quant_result: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor, ): reduce_scatter = self._reduce_scatter(input) - - rmsnorm_result = torch.empty_like( - reduce_scatter, dtype=rmsnorm_result.dtype - ) - quant_result = torch.empty_like( - rmsnorm_result, # Output of RMSNorm - dtype=quant_result.dtype, - ) - static_fp8 = self._functional_rmsnorm_then_quant( - rmsnorm_result, quant_result, reduce_scatter, weight, scale - ) - all_gather = self._all_gather(static_fp8[1]) + rms = self.rmsnorm_matcher(reduce_scatter, weight) + quant, _ = self.quant_matcher(rms, scale) + all_gather = self._all_gather(quant) return all_gather, reduce_scatter @@ -312,118 +202,64 @@ def replacement( class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): - def __init__( - self, epsilon: float, dtype: torch.dtype, device: str, op: torch._ops.OpOverload - ): - super().__init__(epsilon, dtype, device, quant_op=op) + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) def get_inputs(self): mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) - residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) - result = torch.empty([4, 4], device=self.device, dtype=FP8_DTYPE) scale = torch.empty([1, 1], device=self.device, dtype=torch.float32) - return [ - result, - residual, - mm_1, - rms_norm_weights, - scale, - ] + return [residual, mm_1, rms_norm_weights, scale] def register(self, pm_pass: PatternMatcherPass): def pattern( - result: torch.Tensor, residual: torch.Tensor, mm_1: torch.Tensor, rms_norm_weights: torch.Tensor, scale: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: all_reduce = self._all_reduce(mm_1) - static_fp8, rmsnorm_residual_out = ( - self._functional_fused_add_rmsnorm_then_quant( # noqa: E501 - result, all_reduce, residual, rms_norm_weights, scale - ) + rms, residual_out = self.rmsnorm_matcher( + all_reduce, rms_norm_weights, residual ) - return static_fp8[1], rmsnorm_residual_out + quant, _ = self.quant_matcher(rms, scale) + return quant, residual_out def replacement( - result: torch.Tensor, residual: torch.Tensor, mm_1: torch.Tensor, rms_norm_weights: torch.Tensor, scale: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: + # pattern matcher replaces from top-to-bottom, + # so residual is still the full size here. + # add a temporary slice which will become a noop + # once the seqpar pattern with the previous rmsnorm is replaced reduce_scatter = self._reduce_scatter(mm_1) - quant_result_buf = torch.empty_like(reduce_scatter, dtype=result.dtype) - static_fp8, rmsnorm_residual_out = ( - self._functional_fused_add_rmsnorm_then_quant( # noqa: E501 - quant_result_buf, reduce_scatter, residual, rms_norm_weights, scale - ) + residual = residual[0 : reduce_scatter.size(0), ...] 
+ rms, residual_out = self.rmsnorm_matcher( + reduce_scatter, rms_norm_weights, residual ) - all_gather = self._all_gather(static_fp8[1]) - return all_gather, rmsnorm_residual_out + quant, _ = self.quant_matcher(rms, scale) + all_gather = self._all_gather(quant) + # shape of residual changes but that's fine, + # next node is already slicing it, now becomes a noop + return all_gather, residual_out pm.register_replacement( pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass ) - -class LastAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): - def __init__( - self, epsilon: float, dtype: torch.dtype, device: str, op: torch._ops.OpOverload - ): - super().__init__(epsilon, dtype, device, quant_op=op) - - def get_inputs(self): - mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) - - residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) - rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) - result = torch.empty([4, 4], device=self.device, dtype=FP8_DTYPE) - scale = torch.empty([1, 1], device=self.device, dtype=torch.float32) - - return [ - result, - residual, - mm_1, - rms_norm_weights, - scale, - ] - - def register(self, pm_pass: PatternMatcherPass): - def pattern( - result: torch.Tensor, - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - scale: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - all_reduce = self._all_reduce(mm_1) - static_fp8, _ = self._functional_fused_add_rmsnorm_then_quant( - result, all_reduce, residual, rms_norm_weights, scale - ) - return static_fp8[1] - - def replacement( - result: torch.Tensor, - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - scale: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - reduce_scatter = self._reduce_scatter(mm_1) - quant_result_buf = torch.empty_like(reduce_scatter, dtype=result.dtype) - static_fp8, _ = self._functional_fused_add_rmsnorm_then_quant( - quant_result_buf, reduce_scatter, residual, rms_norm_weights, scale - ) - normalized = self._all_gather(static_fp8[1]) - return normalized - pm.register_replacement( - pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + get_first_out_wrapper(pattern), + get_first_out_wrapper(replacement), + self.get_inputs(), + pm.fwd_only, + pm_pass, ) @@ -445,27 +281,45 @@ class SequenceParallelismPass(VllmPatternMatcherPass): GEMM + ReduceScatter and AllGather + GEMM fusions. These fusions can significantly reduce communication overhead and improve overall model performance. + + + This pass splits up the residual tensor across TP ranks and hence divides its size. + Because the pattern matcher starts at the end of the graph, the replacement + contains a slice that temporarily conforms the input residual to the correct size. + After all patterns have been matched, we use a NoOpEliminationPass to clean up + what have now become no-op slices. + + Note that an older version of the pass did not need this as it operated only on + custom rms_norm and fused_rms_norm_add custom ops which did not complain about + mismatched shapes during replacement. So this approach has the same assumption that + correctness is only maintained if all rms_norm operations are split across ranks. + + Correctness-wise, this is approach strictly better than before - before, + the graph was incorrect semantically and shape-wise during the pass. + With this approach there's only semantic incorrectness during the pass. 
+ Both approaches restore a correct graph once all patterns are matched. """ @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) + # Used to cleanup redundant views created temporarily + # to circumvent residual shape change issues + self.noop_cleanup = NoOpEliminationPass(config) + self.noop_cleanup.pass_name = f"{self.pass_name}.{self.noop_cleanup.pass_name}" + self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="sequence_parallelism_pass" ) for epsilon in [1e-5, 1e-6]: # RMSNorm + Static FP8 quantization patterns - fp8_quant_op = torch.ops._C.static_scaled_fp8_quant.default FirstAllReduceRMSNormStaticFP8Pattern( - epsilon, self.model_dtype, self.device, fp8_quant_op + epsilon, self.model_dtype, self.device ).register(self.patterns) MiddleAllReduceRMSNormStaticFP8Pattern( - epsilon, self.model_dtype, self.device, fp8_quant_op - ).register(self.patterns) - LastAllReduceRMSNormStaticFP8Pattern( - epsilon, self.model_dtype, self.device, fp8_quant_op + epsilon, self.model_dtype, self.device ).register(self.patterns) # Normal RMSNorm patterns @@ -477,9 +331,6 @@ def __init__(self, config: VllmConfig): epsilon, self.model_dtype, self.device ).register(self.patterns) - LastAllReduceRMSNormPattern( - epsilon, self.model_dtype, self.device - ).register(self.patterns) self.dump_patterns(config, self.patterns) def is_applicable(self, shape: int | None) -> bool: @@ -508,3 +359,5 @@ def is_applicable(self, shape: int | None) -> bool: def __call__(self, graph: fx.Graph): self.matched_count = self.patterns.apply(graph) logger.debug("Replaced %s patterns", self.matched_count) + # Clean up reshape nodes + self.noop_cleanup(graph) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 87f6b6eed851..bd98be48588f 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -445,8 +445,6 @@ def __post_init__(self): # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: self.compilation_config.pass_config.enable_sequence_parallelism = True - if self.compilation_config.pass_config.enable_sequence_parallelism: - self.compilation_config.custom_ops.append("+rms_norm") if current_platform.support_static_graph_mode(): # if cudagraph_mode is not explicitly set by users, set default @@ -620,6 +618,32 @@ def __post_init__(self): if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: self.compilation_config.set_splitting_ops_for_v1() + if self.compilation_config.pass_config.enable_sequence_parallelism: + # With pipeline parallelism or dynamo partitioning, + # native rms norm tracing errors due to incorrect residual shape. + # Use custom rms norm to unblock. In the future, + # the pass will operate on higher-level IR to avoid the issue. 
+ # TODO: https://github.com/vllm-project/vllm/issues/27894 + is_fullgraph = ( + self.compilation_config.use_inductor_graph_partition + or len(self.compilation_config.splitting_ops) == 0 + ) + if self.parallel_config.pipeline_parallel_size > 1 or not is_fullgraph: + if "-rms_norm" not in self.compilation_config.custom_ops: + self.compilation_config.custom_ops.append("+rms_norm") + else: + regime = ( + "Dynamo partition" + if not is_fullgraph + else "pipeline parallelism" + ) + logger.warning_once( + "Sequence parallelism not supported with" + "native rms_norm when using %s, " + "this will likely lead to an error.", + regime, + ) + # final check of cudagraph mode after all possible updates if current_platform.is_cuda_alike(): if ( From cb15ee28db037cff93a32aa237c862fc949824ce Mon Sep 17 00:00:00 2001 From: tingtinggithub Date: Sat, 15 Nov 2025 04:18:08 -0800 Subject: [PATCH 099/578] Allow Gemma3 to take image embeddings (#28483) Signed-off-by: tingtinggithub --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/gemma3_mm.py | 77 ++++++++++++++++++------- vllm/multimodal/parse.py | 11 ++-- vllm/v1/engine/processor.py | 8 ++- 4 files changed, 69 insertions(+), 29 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 9cdf644c3cc5..6eb0947fe568 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -669,7 +669,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I+ | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ | | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | -| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma3ForConditionalGeneration` | Gemma 3 | T + IE+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. 
| ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 02fb7ef31dc9..8e2bbe8f7990 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Any, Literal +from typing import Annotated, Any, Literal, TypeAlias import torch from torch import nn @@ -20,7 +20,12 @@ MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems +from vllm.multimodal.parse import ( + ImageEmbeddingItems, + ImageProcessorItems, + ImageSize, + MultiModalDataItems, +) from vllm.multimodal.processing import ( BaseMultiModalProcessor, BaseProcessingInfo, @@ -71,7 +76,15 @@ class Gemma3ImagePixelInputs(TensorSchema): num_patches: Annotated[torch.Tensor, TensorShape("bn")] -Gemma3ImageInputs = Gemma3ImagePixelInputs +class Gemma3ImageEmbeddingInputs(TensorSchema): + type: Literal["image_embeds"] = "image_embeds" + image_embeds: Annotated[ + torch.Tensor, + TensorShape("ni", "nf", "hs"), + ] + + +Gemma3ImageInputs: TypeAlias = Gemma3ImagePixelInputs | Gemma3ImageEmbeddingInputs class Gemma3ProcessingInfo(BaseProcessingInfo): @@ -178,8 +191,9 @@ def get_num_crops( def get_image_repl( self, *, - image_width: int, - image_height: int, + image_width: int | None, + image_height: int | None, + num_crops: int | None = None, processor: Gemma3Processor | None, ) -> PromptUpdateDetails[str]: if processor is None: @@ -187,11 +201,13 @@ def get_image_repl( boi_token = processor.boi_token - num_crops = self.get_num_crops( - image_width=image_width, - image_height=image_height, - processor=processor, - ) + if num_crops is None: + assert image_width is not None and image_height is not None + num_crops = self.get_num_crops( + image_width=image_width, + image_height=image_height, + processor=processor, + ) if num_crops == 0: image_text = boi_token @@ -321,6 +337,7 @@ def _get_mm_fields_config( return dict( pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches), num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), ) def _get_prompt_updates( @@ -333,7 +350,19 @@ def _get_prompt_updates( image_token = hf_processor.boi_token def get_replacement_gemma3(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems) + ) + + if isinstance(images, ImageEmbeddingItems): + # For image embedding inputs, only support no crops cases + # since it's not supported in hf processor anyway + return self.info.get_image_repl( + image_width=None, + image_height=None, + num_crops=0, + processor=hf_processor, + ) image_size = images.get_image_size(item_idx) return self.info.get_image_repl( @@ -557,17 +586,19 @@ def _parse_and_validate_image_input( pixel_values = kwargs.pop("pixel_values", None) num_patches = kwargs.pop("num_patches", None) image_embeds = kwargs.pop("image_embeds", None) - assert image_embeds is None, "Gemma3 does not support image_embeds." 
- if pixel_values is None: - return None - image_size = self.config.vision_config.image_size - - return Gemma3ImagePixelInputs( - pixel_values=pixel_values, - num_patches=num_patches, - resolve_bindings={"h": image_size, "w": image_size}, - ) + if pixel_values is not None: + image_size = self.config.vision_config.image_size + return Gemma3ImagePixelInputs( + pixel_values=pixel_values, + num_patches=num_patches, + resolve_bindings={"h": image_size, "w": image_size}, + ) + elif image_embeds is not None: + return Gemma3ImageEmbeddingInputs( + image_embeds=image_embeds, + type="image_embeds", + ) def _image_pixels_to_features( self, @@ -579,7 +610,9 @@ def _image_pixels_to_features( def _process_image_input( self, image_input: Gemma3ImageInputs, - ) -> list[torch.Tensor]: + ) -> torch.Tensor | list[torch.Tensor]: + if image_input["type"] == "image_embeds": + return image_input["image_embeds"] assert self.vision_tower is not None pixel_values = image_input["pixel_values"] diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 2fa3f6ebcc11..810f29072a0f 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -359,8 +359,9 @@ def __init__( ) self.video_needs_metadata = video_needs_metadata - def _is_embeddings( - self, data: object + @classmethod + def is_embeddings( + cls, data: object ) -> TypeGuard[torch.Tensor | list[torch.Tensor]]: if isinstance(data, torch.Tensor): return data.ndim == 3 @@ -420,7 +421,7 @@ def _parse_audio_data( ): return None - if self._is_embeddings(data): + if self.is_embeddings(data): return AudioEmbeddingItems(data) data_items: list[AudioItem] @@ -458,7 +459,7 @@ def _parse_image_data( if self._is_empty(data): return None - if self._is_embeddings(data): + if self.is_embeddings(data): return ImageEmbeddingItems(data) if ( @@ -484,7 +485,7 @@ def _parse_video_data( if self._is_empty(data): return None - if self._is_embeddings(data): + if self.is_embeddings(data): return VideoEmbeddingItems(data) data_items: list[VideoItem] diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0404f6ff2771..fffd075a5165 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,6 +14,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams @@ -340,7 +341,12 @@ def _extract_mm_data(p: PromptType): mm_uuids: dict[str, list[str | None] | str] = {} for modality, data in mm_data.items(): - n = len(data) if isinstance(data, list) else 1 + # Hash each item for embedding inputs. 
+ n = ( + len(data) + if isinstance(data, list) or MultiModalDataParser.is_embeddings(data) + else 1 + ) mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] return mm_uuids From 89d3679221023fc18fd47df8fc426347fa9694e1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 21:33:27 +0800 Subject: [PATCH 100/578] [Doc] Fix failing doc build (#28772) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/README.md | 4 +- docs/cli/bench/latency.md | 4 +- docs/cli/bench/serve.md | 4 +- docs/cli/bench/sweep/plot.md | 4 +- docs/cli/bench/sweep/serve.md | 4 +- docs/cli/bench/sweep/serve_sla.md | 4 +- docs/cli/bench/throughput.md | 4 +- docs/cli/chat.md | 4 +- docs/cli/complete.md | 4 +- docs/cli/run-batch.md | 4 +- docs/cli/serve.md | 4 +- docs/configuration/serve_args.md | 2 +- docs/mkdocs/hooks/generate_argparse.py | 77 ++++++++++++++++---------- docs/usage/README.md | 2 +- 14 files changed, 72 insertions(+), 53 deletions(-) diff --git a/docs/README.md b/docs/README.md index 0608794e7e65..0c279c19f96c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at Where to get started with vLLM depends on the type of user. If you are looking to: - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md) -- Build applications with vLLM, we recommend starting with the [User Guide](./usage) -- Build vLLM, we recommend starting with [Developer Guide](./contributing) +- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md) +- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md) For information about the development of vLLM, see: diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md index 21ab13e63781..ea7ea7321ffc 100644 --- a/docs/cli/bench/latency.md +++ b/docs/cli/bench/latency.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_latency.md" +--8<-- "docs/argparse/bench_latency.inc.md" diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md index f7c415c6becb..f7dc8036cc26 100644 --- a/docs/cli/bench/serve.md +++ b/docs/cli/bench/serve.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_serve.md" +--8<-- "docs/argparse/bench_serve.inc.md" diff --git a/docs/cli/bench/sweep/plot.md b/docs/cli/bench/sweep/plot.md index f29bffb64655..a101330e093c 100644 --- a/docs/cli/bench/sweep/plot.md +++ b/docs/cli/bench/sweep/plot.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_sweep_plot.md" +--8<-- "docs/argparse/bench_sweep_plot.inc.md" diff --git a/docs/cli/bench/sweep/serve.md b/docs/cli/bench/sweep/serve.md index 5b5f91a951ed..f0468f06fc28 100644 --- a/docs/cli/bench/sweep/serve.md +++ b/docs/cli/bench/sweep/serve.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_sweep_serve.md" +--8<-- "docs/argparse/bench_sweep_serve.inc.md" diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md index 5f8ab6005e50..5642ec67eb00 100644 --- a/docs/cli/bench/sweep/serve_sla.md +++ b/docs/cli/bench/sweep/serve_sla.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments 
---8<-- "docs/argparse/bench_sweep_serve_sla.md" +--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md" diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md index e4ff5ce43c9c..e7f618fb4d14 100644 --- a/docs/cli/bench/throughput.md +++ b/docs/cli/bench/throughput.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_throughput.md" +--8<-- "docs/argparse/bench_throughput.inc.md" diff --git a/docs/cli/chat.md b/docs/cli/chat.md index b006cb8de60d..0246bd431b10 100644 --- a/docs/cli/chat.md +++ b/docs/cli/chat.md @@ -1,5 +1,5 @@ # vllm chat -## Options +## Arguments ---8<-- "docs/argparse/chat.md" +--8<-- "docs/argparse/chat.inc.md" diff --git a/docs/cli/complete.md b/docs/cli/complete.md index 400359acf4fb..eb2ffdaabac2 100644 --- a/docs/cli/complete.md +++ b/docs/cli/complete.md @@ -1,5 +1,5 @@ # vllm complete -## Options +## Arguments ---8<-- "docs/argparse/complete.md" +--8<-- "docs/argparse/complete.inc.md" diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md index f7d401b8dad2..758fbda28397 100644 --- a/docs/cli/run-batch.md +++ b/docs/cli/run-batch.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/run-batch.md" +--8<-- "docs/argparse/run-batch.inc.md" diff --git a/docs/cli/serve.md b/docs/cli/serve.md index 2c8f9d320f5d..35652fec587b 100644 --- a/docs/cli/serve.md +++ b/docs/cli/serve.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/serve.md" +--8<-- "docs/argparse/serve.inc.md" diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md index c1cc5577bc7a..baaf21f01f06 100644 --- a/docs/configuration/serve_args.md +++ b/docs/configuration/serve_args.md @@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server. ## CLI Arguments The `vllm serve` command is used to launch the OpenAI-compatible server. -To see the available options, take a look at the [CLI Reference](../cli/README.md#options)! +To see the available options, take a look at the [CLI Reference](../cli/README.md)! 
## Configuration file diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ce1c5c53cf35..735074c08b8c 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib +import importlib.metadata +import importlib.util import logging import sys import traceback -from argparse import SUPPRESS, HelpFormatter +from argparse import SUPPRESS, Action, HelpFormatter +from collections.abc import Iterable +from importlib.machinery import ModuleSpec from pathlib import Path -from typing import Literal +from typing import TYPE_CHECKING, Literal from unittest.mock import MagicMock, patch from pydantic_core import core_schema @@ -19,6 +22,11 @@ sys.path.insert(0, str(ROOT_DIR)) +def mock_if_no_torch(mock_module: str, mock: MagicMock): + if not importlib.util.find_spec("torch"): + sys.modules[mock_module] = mock + + # Mock custom op code class MockCustomOp: @staticmethod @@ -29,18 +37,21 @@ def decorator(cls): return decorator -noop = lambda *a, **k: None -sys.modules["vllm._C"] = MagicMock() -sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp) -sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop) +mock_if_no_torch("vllm._C", MagicMock()) +mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp)) +mock_if_no_torch( + "vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None) +) + # Mock any version checks by reading from compiled CI requirements with open(ROOT_DIR / "requirements/test.txt") as f: VERSIONS = dict(line.strip().split("==") for line in f if "==" in line) importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0" + # Make torch.nn.Parameter safe to inherit from -sys.modules["torch.nn"] = MagicMock(Parameter=object) +mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) class PydanticMagicMock(MagicMock): @@ -49,31 +60,34 @@ class PydanticMagicMock(MagicMock): def __init__(self, *args, **kwargs): name = kwargs.pop("name", None) super().__init__(*args, **kwargs) - self.__spec__ = importlib.machinery.ModuleSpec(name, None) + self.__spec__ = ModuleSpec(name, None) def __get_pydantic_core_schema__(self, source_type, handler): return core_schema.any_schema() -def auto_mock(module, attr, max_mocks=100): +def auto_mock(module_name: str, attr: str, max_mocks: int = 100): """Function that automatically mocks missing modules during imports.""" - logger.info("Importing %s from %s", attr, module) + logger.info("Importing %s from %s", attr, module_name) + for _ in range(max_mocks): try: + module = importlib.import_module(module_name) + # First treat attr as an attr, then as a submodule - return getattr( - importlib.import_module(module), - attr, - importlib.import_module(f"{module}.{attr}"), - ) + if hasattr(module, attr): + return getattr(module, attr) + + return importlib.import_module(f"{module_name}.{attr}") except ModuleNotFoundError as e: + assert e.name is not None logger.info("Mocking %s for argparse doc generation", e.name) sys.modules[e.name] = PydanticMagicMock(name=e.name) - except Exception as e: - logger.warning("Failed to import %s.%s: %s", module, attr, e) + except Exception: + logger.exception("Failed to import %s.%s: %s", module_name, attr) raise ImportError( - f"Failed to import {module}.{attr} after mocking {max_mocks} imports" + f"Failed to import 
{module_name}.{attr} after mocking {max_mocks} imports" ) @@ -91,21 +105,26 @@ def auto_mock(module, attr, max_mocks=100): CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand") openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") -FlexibleArgumentParser = auto_mock( - "vllm.utils.argparse_utils", "FlexibleArgumentParser" -) + +if TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = auto_mock( + "vllm.utils.argparse_utils", "FlexibleArgumentParser" + ) class MarkdownFormatter(HelpFormatter): """Custom formatter that generates markdown for argument groups.""" - def __init__(self, prog, starting_heading_level=3): - super().__init__(prog, max_help_position=float("inf"), width=float("inf")) + def __init__(self, prog: str, starting_heading_level: int = 3): + super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize) + self._section_heading_prefix = "#" * starting_heading_level self._argument_heading_prefix = "#" * (starting_heading_level + 1) self._markdown_output = [] - def start_section(self, heading): + def start_section(self, heading: str): if heading not in {"positional arguments", "options"}: heading_md = f"\n{self._section_heading_prefix} {heading}\n\n" self._markdown_output.append(heading_md) @@ -113,14 +132,14 @@ def start_section(self, heading): def end_section(self): pass - def add_text(self, text): + def add_text(self, text: str): if text: self._markdown_output.append(f"{text.strip()}\n\n") def add_usage(self, usage, actions, groups, prefix=None): pass - def add_arguments(self, actions): + def add_arguments(self, actions: Iterable[Action]): for action in actions: if len(action.option_strings) == 0 or "--help" in action.option_strings: continue @@ -169,7 +188,7 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser: # Auto-mock runtime imports if tb_list := traceback.extract_tb(e.__traceback__): path = Path(tb_list[-1].filename).relative_to(ROOT_DIR) - auto_mock(module=".".join(path.parent.parts), attr=path.stem) + auto_mock(module_name=".".join(path.parent.parts), attr=path.stem) return create_parser(add_cli_args, **kwargs) else: raise e @@ -209,7 +228,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Generate documentation for each parser for stem, parser in parsers.items(): - doc_path = ARGPARSE_DOC_DIR / f"{stem}.md" + doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md" # Specify encoding for building on Windows with open(doc_path, "w", encoding="utf-8") as f: f.write(super(type(parser), parser).format_help()) diff --git a/docs/usage/README.md b/docs/usage/README.md index 0c63d01f0f99..4e8ece2c0605 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -1,6 +1,6 @@ # Using vLLM -First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment. +First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment. 
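A minimal standalone sketch of the retry-on-`ModuleNotFoundError` idea used by `auto_mock` in `docs/mkdocs/hooks/generate_argparse.py` above: each failed import registers a `MagicMock` under the missing package name and retries, so argparse docs can be built without heavy optional dependencies installed. The helper name and retry limit below are illustrative only, not vLLM APIs.

```python
import importlib
import sys
from unittest.mock import MagicMock


def import_with_mocks(module_name: str, max_mocks: int = 100):
    """Import a module, stubbing out any missing third-party dependencies."""
    for _ in range(max_mocks):
        try:
            return importlib.import_module(module_name)
        except ModuleNotFoundError as e:
            # The import pulled in a package that is not installed; register a
            # MagicMock under that name in sys.modules and retry the import.
            sys.modules[e.name] = MagicMock(name=e.name)
    raise ImportError(f"gave up on {module_name} after {max_mocks} mocked imports")
```
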
Then, vLLM supports the following usage patterns: From 085a5253321a66d7aac0f990d82417ad85ec0eb0 Mon Sep 17 00:00:00 2001 From: hwhaokun Date: Sat, 15 Nov 2025 21:44:12 +0800 Subject: [PATCH 101/578] [Model] Fix lmhead init bug of bailing_moe (#28777) Signed-off-by: hwhaokun Co-authored-by: zhaozx-cn Co-authored-by: Jee Jee Li --- vllm/model_executor/models/bailing_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 6e1e5b1ddc50..024425bb2440 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -599,7 +599,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix=f"{prefix}.lm_head", + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) else: From e439c784fa318dbc23c04b0730bee0fccf46481d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Sat, 15 Nov 2025 15:12:02 +0100 Subject: [PATCH 102/578] Add support for Eagle with separate lm-head and embed_tokens layers (#28549) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- tests/v1/spec_decode/test_eagle.py | 33 +++--- tests/v1/spec_decode/test_mtp.py | 4 + vllm/model_executor/models/deepseek_eagle.py | 3 +- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/interfaces.py | 70 ++++++++++++- vllm/model_executor/models/llama.py | 6 +- vllm/model_executor/models/llama4_eagle.py | 3 +- vllm/model_executor/models/llama_eagle.py | 3 +- vllm/model_executor/models/llama_eagle3.py | 3 +- vllm/model_executor/models/minicpm_eagle.py | 12 ++- vllm/model_executor/models/utils.py | 23 +++++ vllm/v1/spec_decode/eagle.py | 103 +++++++++++++------ 12 files changed, 204 insertions(+), 63 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 421da5241555..805b8c86b080 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -324,6 +324,7 @@ def test_prepare_inputs_padded(): @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) +@pytest.mark.parametrize("use_distinct_lm_head", [True, False]) @mock.patch("vllm.v1.spec_decode.eagle.get_pp_group") @mock.patch("vllm.v1.spec_decode.eagle.get_layers_from_vllm_config") @mock.patch("vllm.v1.spec_decode.eagle.get_model") @@ -335,6 +336,7 @@ def test_load_model( attn_backend, pp_size, use_distinct_embed_tokens, + use_distinct_lm_head, monkeypatch, ): monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) @@ -350,12 +352,13 @@ def test_load_model( # Setup draft model mock mock_model = mock.MagicMock() + mock_model.model = mock.MagicMock() + mock_model.has_own_embed_tokens = use_distinct_embed_tokens if use_distinct_embed_tokens: - # Some models can have a different hidden size than the target model, - # so we test that their embed_tokens doesn't get overwritten - mock_model.model.embed_tokens.weight.shape = (131072, 2048) - else: - mock_model.model.embed_tokens.weight.shape = (131072, 4096) + mock_model.model.embed_tokens = mock.MagicMock() + mock_model.has_own_lm_head = use_distinct_lm_head + if use_distinct_lm_head: + mock_model.lm_head = mock.MagicMock() mock_get_model.return_value = mock_model @@ -391,15 +394,13 @@ class 
_TargetModelStub(LlamaForCausalLM): target_model = mock.create_autospec(_TargetModelStub, instance=True) target_model.model = mock.MagicMock() - target_model.model.embed_tokens.weight.shape = (131072, 4096) + target_model.lm_head = mock.MagicMock() + target_model.model.embed_tokens = mock.MagicMock() from vllm.model_executor.models import SupportsMultiModal assert not isinstance(target_model, SupportsMultiModal) - if method == "eagle": - target_model.lm_head = mock.MagicMock() - # Create proposer using the helper function proposer = _create_proposer(method, num_speculative_tokens=8) @@ -409,18 +410,18 @@ class _TargetModelStub(LlamaForCausalLM): # Verify common interactions mock_get_model.assert_called_once() - # Verify that EAGLE models gain the lm head from the target model - if method == "eagle": - assert proposer.model.lm_head == target_model.lm_head + # Verify that the lm head is set correctly + if use_distinct_lm_head: + assert proposer.model.lm_head is not target_model.lm_head + else: + assert proposer.model.lm_head is target_model.lm_head # Verify that the embed tokens are set correctly # If pp_size is > 1, the embed tokens should be distinct if pp_size > 1 or use_distinct_embed_tokens: - assert proposer.model.model.embed_tokens != target_model.model.embed_tokens + assert proposer.model.model.embed_tokens is not target_model.model.embed_tokens else: - # When pp_size is 1 and the draft and target models have - # embed_tokens of the same shape, they should be shared. - assert proposer.model.model.embed_tokens == target_model.model.embed_tokens + assert proposer.model.model.embed_tokens is target_model.model.embed_tokens @pytest.mark.parametrize("method", ["eagle", "eagle3"]) diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 6d59b58e739e..c5c0491abaf7 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -67,6 +67,10 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers, mock_get_pp_gro mock_model = mock.MagicMock() mock_model.model.embed_tokens.weight.shape = (131072, 4096) mock_get_model.return_value = mock_model + # MTP does not have its own embed_tokens or lm_head + # so it should share them with the target model + mock_model.has_own_embed_tokens = False + mock_model.has_own_lm_head = False target_attn_layers = {"target_attn_1": mock.MagicMock()} all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()} diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 9e834a73f8e5..3fb04c3b70dd 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -26,7 +26,7 @@ ) from vllm.utils import init_logger -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -250,6 +250,7 @@ def transform(inputs): name, loaded_weight = inputs if "lm_head" not in name: name = "model." 
+ name + process_eagle_weight(self, name) return name, loaded_weight loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 115818d903a6..e8ee9951d611 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -85,7 +85,7 @@ ) from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .interfaces import MixtureOfExperts, SupportsEagle, SupportsLoRA, SupportsPP from .utils import ( PPMissingLayer, is_pp_missing_parameter, @@ -1311,7 +1311,7 @@ def update_physical_experts_metadata( class DeepseekV2ForCausalLM( - nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA + nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA, SupportsEagle ): packed_modules_mapping = { "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 929bfaaee5cb..dc4caf2f02f9 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -932,13 +932,73 @@ def supports_transcription( @runtime_checkable -class SupportsEagle3(Protocol): +class SupportsEagleBase(Protocol): + """Base interface for models that support EAGLE-based speculative decoding.""" + + has_own_lm_head: bool = False + """ + A flag that indicates this model has trained its own lm_head. + """ + + has_own_embed_tokens: bool = False + """ + A flag that indicates this model has trained its own input embeddings. + """ + + +@overload +def supports_any_eagle(model: type[object]) -> TypeIs[type[SupportsEagleBase]]: ... + + +@overload +def supports_any_eagle(model: object) -> TypeIs[SupportsEagleBase]: ... + + +def supports_any_eagle( + model: type[object] | object, +) -> TypeIs[type[SupportsEagleBase]] | TypeIs[SupportsEagleBase]: + """Check if model supports any EAGLE variant (1, 2, or 3).""" + return supports_eagle(model) or supports_eagle3(model) + + +@runtime_checkable +class SupportsEagle(SupportsEagleBase, Protocol): + """The interface required for models that support + EAGLE-1 and EAGLE-2 speculative decoding.""" + + supports_eagle: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports EAGLE-1 and EAGLE-2 + speculative decoding. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + +@overload +def supports_eagle(model: type[object]) -> TypeIs[type[SupportsEagle]]: ... + + +@overload +def supports_eagle(model: object) -> TypeIs[SupportsEagle]: ... + + +def supports_eagle( + model: type[object] | object, +) -> TypeIs[type[SupportsEagle]] | TypeIs[SupportsEagle]: + return isinstance(model, SupportsEagle) + + +@runtime_checkable +class SupportsEagle3(SupportsEagleBase, Protocol): """The interface required for models that support - EAGLE3 speculative decoding.""" + EAGLE-3 speculative decoding.""" supports_eagle3: ClassVar[Literal[True]] = True """ - A flag that indicates this model supports EAGLE3 + A flag that indicates this model supports EAGLE-3 speculative decoding. Note: @@ -949,7 +1009,7 @@ class SupportsEagle3(Protocol): def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: """ Set which layers should output auxiliary - hidden states for EAGLE3. + hidden states for EAGLE-3. 
Args: layers: Tuple of layer indices that should output auxiliary @@ -960,7 +1020,7 @@ def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: """ Get the layer indices that should output auxiliary hidden states - for EAGLE3. + for EAGLE-3. Returns: Tuple of layer indices for auxiliary hidden state outputs. diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c49a1ea817f9..0a3f37c30ab5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -58,7 +58,7 @@ ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -529,7 +529,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params -class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): +class LlamaForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 +): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index e8716d652415..660c8f1bb522 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.utils import extract_layer_index from .interfaces import SupportsMultiModal -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -212,6 +212,7 @@ def transform(inputs): name, weight = self.permute_qk_weight_for_rotary(name, loaded_weight) if "lm_head" not in name: name = "model." + name + process_eagle_weight(self, name) return name, weight loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index ab2a9f6f06db..0287132c5637 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -17,7 +17,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -179,6 +179,7 @@ def transform(inputs): name, loaded_weight = inputs if "lm_head" not in name: name = "model." 
+ name + process_eagle_weight(self, name) return name, loaded_weight loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 6edc9519dfbb..a3bcc5eeb32b 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -23,7 +23,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -324,6 +324,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): if "embed_tokens" in name: includes_embed_tokens = True model_weights[name] = loaded_weight + process_eagle_weight(self, name) skip_substrs = [] if not includes_draft_id_mapping: diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 0ca31913485d..d0cdb70aa857 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle, SupportsLoRA, SupportsPP from .minicpm import MiniCPMAttention as EagleMiniCPMAttention from .minicpm import MiniCPMMLP as EagleMiniCPMMLP from .minicpm import MiniCPMMoE as EagleMiniCPMMoE @@ -52,6 +52,7 @@ is_pp_missing_parameter, make_empty_intermediate_tensors_factory, maybe_prefix, + process_eagle_weight, ) @@ -289,7 +290,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params -class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -376,8 +377,13 @@ def compute_logits( return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def transform(inputs): + name, loaded_weight = inputs + process_eagle_weight(self, name) + return name, loaded_weight + loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(map(transform, weights)) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index e5663c8a057a..0d811fbc7585 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -19,6 +19,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils.math_utils import cdiv @@ -825,3 +826,25 @@ def sequence_parallel_chunk_impl_fake(x: torch.Tensor) -> torch.Tensor: fake_impl=sequence_parallel_chunk_impl_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) + + +def process_eagle_weight( + model: nn.Module, + name: str, +) -> None: + """ + Update EAGLE model flags based on loaded weight name. + This should be called during weight loading to detect if a model + has its own lm_head or embed_tokens weight. 
+ Args: + model: The model instance (must support EAGLE) + name: The name of the weight to process + """ + if not supports_any_eagle(model): + return + + # To prevent overriding with target model's layers + if "lm_head" in name: + model.has_own_lm_head = True + if "embed_tokens" in name: + model.has_own_embed_tokens = True diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f3b34544f8d9..ed602f39d0f9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -991,6 +991,7 @@ def load_model(self, target_model: nn.Module) -> None: target_language_model = target_model.get_language_model() else: target_language_model = target_model + # share embed_tokens with the target model if needed if get_pp_group().world_size == 1: if hasattr(target_language_model.model, "embed_tokens"): @@ -1002,52 +1003,92 @@ def load_model(self, target_model: nn.Module) -> None: "Target model does not have 'embed_tokens' or 'embedding' attribute" ) - # Check if shapes match and we found the embedding - eagle_shape = self.model.model.embed_tokens.weight.shape - target_shape = target_embed_tokens.weight.shape - if eagle_shape == target_shape: - logger.info( - "Assuming the EAGLE head shares the same vocab embedding" - " with the target model." - ) - del self.model.model.embed_tokens - self.model.model.embed_tokens = target_embed_tokens + share_embeddings = False + if hasattr(self.model, "has_own_embed_tokens"): + # EAGLE model + if not self.model.has_own_embed_tokens: + share_embeddings = True + logger.info( + "Detected EAGLE model without its own embed_tokens in the" + " checkpoint. Sharing target model embedding weights with the" + " draft model." + ) + elif ( + isinstance(target_embed_tokens.weight, torch.Tensor) + and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) + and torch.equal( + target_embed_tokens.weight, self.model.model.embed_tokens.weight + ) + ): + share_embeddings = True + logger.info( + "Detected EAGLE model with embed_tokens identical to the target" + " model. Sharing target model embedding weights with the draft" + " model." + ) + else: + logger.info( + "Detected EAGLE model with distinct embed_tokens weights. " + "Keeping separate embedding weights from the target model." + ) else: + # MTP model + share_embeddings = True logger.info( - "The EAGLE head's vocab embedding will be loaded separately" - " from the target model." + "Detected MTP model. " + "Sharing target model embedding weights with the draft model." ) + + if share_embeddings: + if hasattr(self.model.model, "embed_tokens"): + del self.model.model.embed_tokens + self.model.model.embed_tokens = target_embed_tokens else: logger.info( - "The EAGLE head's vocab embedding will be loaded separately" + "The draft model's vocab embedding will be loaded separately" " from the target model." 
) # share lm_head with the target model if needed - # some model definition do not define lm_head explicitly - # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM - if self.vllm_config.speculative_config.method != "eagle3": - if hasattr(target_language_model, "lm_head"): - logger.info("Loading EAGLE LM head weights from the target model.") - self.model.lm_head = target_language_model.lm_head - else: - if ( - hasattr(self.model, "lm_head") - and hasattr(target_language_model, "lm_head") - and self.model.lm_head.weight.shape - == target_language_model.lm_head.weight.shape + share_lm_head = False + if hasattr(self.model, "has_own_lm_head"): + # EAGLE model + if not self.model.has_own_lm_head: + share_lm_head = True + logger.info( + "Detected EAGLE model without its own lm_head in the checkpoint. " + "Sharing target model lm_head weights with the draft model." + ) + elif ( + hasattr(target_language_model, "lm_head") + and isinstance(target_language_model.lm_head.weight, torch.Tensor) + and isinstance(self.model.lm_head.weight, torch.Tensor) + and torch.equal( + target_language_model.lm_head.weight, self.model.lm_head.weight + ) ): + share_lm_head = True logger.info( - "Assuming the EAGLE head shares the same lm_head" - " with the target model." + "Detected EAGLE model with lm_head identical to the target model. " + "Sharing target model lm_head weights with the draft model." ) - del self.model.lm_head - self.model.lm_head = target_language_model.lm_head else: logger.info( - "The EAGLE head's lm_head will be loaded separately" - " from the target model." + "Detected EAGLE model with distinct lm_head weights. " + "Keeping separate lm_head weights from the target model." ) + else: + # MTP model + share_lm_head = True + logger.info( + "Detected MTP model. " + "Sharing target model lm_head weights with the draft model." 
+ ) + + if share_lm_head and hasattr(target_language_model, "lm_head"): + if hasattr(self.model, "lm_head"): + del self.model.lm_head + self.model.lm_head = target_language_model.lm_head @torch.inference_mode() def dummy_run( From 637f292196237982558936166540ed8d153a75eb Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 15 Nov 2025 08:44:14 -0800 Subject: [PATCH 103/578] [CI] Fix broken pipeline (#28781) Signed-off-by: Nick Hill --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 723f311a2646..4ac76aba67b9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -926,7 +926,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion & Compile Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 From 07cadab27a23bf1fbc1090f77fcc650eeb1612e8 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 15 Nov 2025 19:03:09 +0000 Subject: [PATCH 104/578] [Model][Qwen3VL] Cache positional embedding indices (#28475) Signed-off-by: Lukas Geiger Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 57 +++++++++++++++----------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index fa6b71bf9268..7f0c9372991d 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -25,7 +25,7 @@ """Inference-only Qwen3VL model compatible with HuggingFace weights.""" from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence -from functools import partial +from functools import lru_cache, partial from itertools import islice from typing import Any @@ -416,30 +416,41 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device + @staticmethod + @lru_cache(maxsize=1024) + def rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor: + hpos_ids = np.broadcast_to(np.arange(h).reshape(h, 1), (h, w)) + h_div = h // spatial_merge_size + w_div = w // spatial_merge_size + hpos_ids = hpos_ids.reshape( + h_div, + spatial_merge_size, + w_div, + spatial_merge_size, + ) + hpos_ids = hpos_ids.transpose(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = np.broadcast_to(np.arange(w).reshape(1, w), (h, w)) + wpos_ids = wpos_ids.reshape( + h_div, + spatial_merge_size, + w_div, + spatial_merge_size, + ) + wpos_ids = wpos_ids.transpose(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + + return torch.from_numpy(np.stack([hpos_ids, wpos_ids], axis=-1)) + def rot_pos_emb(self, grid_thw: list[list[int]]): - pos_ids = [] max_grid_size = max(max(h, w) for _, h, w in grid_thw) - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ) - hpos_ids = hpos_ids.permute(0, 2, 1, 3) - hpos_ids = hpos_ids.flatten() - - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ) - wpos_ids = wpos_ids.permute(0, 2, 1, 3) - wpos_ids = wpos_ids.flatten() - pos_ids.append(torch.stack([hpos_ids, wpos_ids], 
dim=-1).repeat(t, 1)) + pos_ids = [ + self.rot_pos_ids(h, w, self.spatial_merge_size) + if t == 1 + else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1) + for t, h, w in grid_thw + ] pos_ids = torch.cat(pos_ids, dim=0) rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) From 2bb4435cb7e2e2317b0f20803347690fb38fe6b4 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:27:50 +0100 Subject: [PATCH 105/578] [Doc]: fix typos in various files (#28567) Signed-off-by: Didier Durand --- docs/design/moe_kernel_features.md | 2 +- docs/features/quantization/quark.md | 2 +- vllm/compilation/compiler_interface.py | 2 +- vllm/compilation/decorators.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index ee224e6922fb..7663b82266f0 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. ## Fused MoE Experts Kernels -The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index be0702f4c9e1..bd7bc186e13a 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later. -As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are: +As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benefits. 
They are: - amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8 - amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8 diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index b0cdb08884a3..11cf0f85c178 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -299,7 +299,7 @@ def initialize_cache( self.base_cache_dir = cache_dir[: -len(prefix)] if prefix else cache_dir if disable_cache: return - # redirect the cache directory to a sub-directory + # redirect the cache directory to a subdirectory # set flags so that Inductor and Triton store their cache # in the cache_dir, then users only need to copy the cache_dir # to another machine to reuse the cache. diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index e325bca73abb..11a18c0e6bb7 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -159,7 +159,7 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ... `mark_unbacked_dims` is a dictionary that maps argument names with a dynamic dim to be decorated with `mark_unbacked`. This is useful if we would like to - enforce that dynamo do not specialize on 0/1 values in the case of dummy input + enforce that dynamo does not specialize on 0/1 values in the case of dummy input such as for vision model compilation """ @@ -483,7 +483,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): Context manager to set/unset customized cudagraph partition wrappers. If we're using Inductor-based graph partitioning, we currently have the - whole `fx.Graph` before Inductor lowering and and the piecewise + whole `fx.Graph` before Inductor lowering and the piecewise splitting happens after all graph passes and fusions. Here, we add a custom hook for Inductor to wrap each partition with our static graph wrapper class to maintain more control over static graph diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0d6164180e6..6590ca54af68 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2871,7 +2871,7 @@ def propose_draft_token_ids( "gpu_model_runner: set_async_sampled_token_ids" ): # Save ref of sampled_token_ids CPU tensor if the batch contains - # any requests with sampling params that that require output ids. + # any requests with sampling params that require output ids. self.input_batch.set_async_sampled_token_ids( async_output.sampled_token_ids_cpu, async_output.async_copy_ready_event, From be263f76451ad8a32baf0b935d3f0432d05300e6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 15 Nov 2025 17:35:06 -0500 Subject: [PATCH 106/578] [BugFix] Fix `AssertionError: DCP not support reorder_batch_threshold > 1 now.` (#28751) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6590ca54af68..ffbac5fe12f7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -630,16 +630,6 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: return if self.reorder_batch_threshold is not None: - # NOTE(lucas): currently no backend supports the custom masking - # required for DCP with q_len > 1, so we assert here. Remove this - # assert once the custom mask is support is added to FA3. 
- if ( - self.dcp_world_size > 1 - and envs.VLLM_ATTENTION_BACKEND != "FLASH_ATTN_MLA" - ): - assert self.reorder_batch_threshold == 1, ( - "DCP not support reorder_batch_threshold > 1 now." - ) reorder_batch_to_split_decodes_and_prefills( self.input_batch, scheduler_output, From f849ee739cdb3d82fce1660a6fd91806e8ae9bff Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Sun, 16 Nov 2025 00:22:17 -0500 Subject: [PATCH 107/578] Adding a benchmark for batch invariance (#28161) Signed-off-by: Bram Wasti Signed-off-by: Bram Wasti Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- benchmarks/benchmark_batch_invariance.py | 380 +++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100755 benchmarks/benchmark_batch_invariance.py diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py new file mode 100755 index 000000000000..b5c16c42de46 --- /dev/null +++ b/benchmarks/benchmark_batch_invariance.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode. + +This benchmark runs the same workload twice: +1. With VLLM_BATCH_INVARIANT=0 (baseline) +2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode) + +And reports the timing and throughput metrics for comparison. + +Environment variables: + VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B") + VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek) + VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128) + VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5) + VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024) + VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048) + VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128) + VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0) + VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4) + VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120) + VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN) + +Example usage: + # Benchmark qwen3 (default) + python benchmarks/benchmark_batch_invariance.py + + # Benchmark deepseek with 8 GPUs + VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\ + python benchmarks/benchmark_batch_invariance.py + + # Quick test with fewer trials + VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\ + python benchmarks/benchmark_batch_invariance.py +""" + +import contextlib +import os +import random +import time + +from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + + +def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: + """Generate a random prompt for benchmarking.""" + prompt_templates = [ + "Question: What is the capital of France?\nAnswer: The capital of France is", + "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", + "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", + "Once upon a time in a distant galaxy, there lived", + "The old man walked slowly down the street, remembering", + "In the year 2157, humanity finally discovered", + "To implement a binary search tree in Python, first we need to", + "The algorithm works by iterating through the array and", + "Here's how to optimize database queries using indexing:", + "The Renaissance was a period in European history 
that", + "Climate change is caused by several factors including", + "The human brain contains approximately 86 billion neurons which", + "I've been thinking about getting a new laptop because", + "Yesterday I went to the store and bought", + "My favorite thing about summer is definitely", + ] + + base_prompt = random.choice(prompt_templates) + + if max_words < min_words: + max_words = min_words + target_words = random.randint(min_words, max_words) + + if target_words > 50: + padding_text = ( + " This is an interesting topic that deserves more explanation. " + * (target_words // 50) + ) + base_prompt = base_prompt + padding_text + + return base_prompt + + +def run_benchmark_with_batch_invariant( + model: str, + tp_size: int, + max_batch_size: int, + num_trials: int, + min_prompt: int, + max_prompt: int, + max_tokens: int, + temperature: float, + gpu_mem_util: float, + max_model_len: int, + backend: str, + batch_invariant: bool, + seed: int = 12345, +) -> dict: + """ + Run the benchmark with the specified configuration. + + Returns a dict with timing and throughput metrics. + """ + random.seed(seed) + + # Set environment variables + os.environ["VLLM_ATTENTION_BACKEND"] = backend + if batch_invariant: + os.environ["VLLM_BATCH_INVARIANT"] = "1" + else: + os.environ["VLLM_BATCH_INVARIANT"] = "0" + + print(f"\n{'=' * 80}") + print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}") + print(f" Model: {model}") + print(f" TP Size: {tp_size}") + print(f" Backend: {backend}") + print(f" Max Batch Size: {max_batch_size}") + print(f" Trials: {num_trials}") + print(f" Max Tokens: {max_tokens}") + print(f"{'=' * 80}\n") + + sampling = SamplingParams( + temperature=temperature, + top_p=0.95, + max_tokens=max_tokens, + seed=20240919, + ) + + needle_prompt = "There once was a " + + llm = None + try: + # Create LLM engine + start_init = time.perf_counter() + llm = LLM( + model=model, + max_num_seqs=max_batch_size, + gpu_memory_utilization=gpu_mem_util, + max_model_len=max_model_len, + dtype="bfloat16", + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + ) + init_time = time.perf_counter() - start_init + print(f"Engine initialization time: {init_time:.2f}s\n") + + # Generate baseline + print("Generating baseline (warmup)...") + baseline_out = llm.generate([needle_prompt], sampling) + assert len(baseline_out) == 1 + baseline_text = baseline_out[0].outputs[0].text + print(f"Baseline output: '{baseline_text[:50]}...'\n") + + # Run trials and measure timing + trial_times: list[float] = [] + total_tokens = 0 + total_prompts = 0 + + for trial in range(num_trials): + # Create a batch + prompts: list[str] = [] + batch_size = random.randint(max_batch_size // 2, max_batch_size) + needle_pos = random.randint(0, batch_size - 1) + for i in range(batch_size): + if i == needle_pos: + prompts.append(needle_prompt) + else: + prompts.append(_random_prompt(min_prompt, max_prompt)) + + # Measure time for this trial + start_time = time.perf_counter() + outputs = llm.generate(prompts, sampling) + trial_time = time.perf_counter() - start_time + + trial_times.append(trial_time) + total_prompts += len(prompts) + + # Count tokens + for output in outputs: + if output.outputs: + total_tokens += len(output.outputs[0].token_ids) + + print( + f"Trial {trial + 1}/{num_trials}: " + f"batch_size={batch_size}, " + f"time={trial_time:.2f}s" + ) + + # Verify needle output still matches + needle_output = outputs[needle_pos] + assert needle_output.prompt == needle_prompt + + # Compute statistics + avg_time = sum(trial_times) / 
len(trial_times) + min_time = min(trial_times) + max_time = max(trial_times) + throughput = total_tokens / sum(trial_times) + prompts_per_sec = total_prompts / sum(trial_times) + + print(f"\n{'=' * 80}") + print("RESULTS:") + print(f" Average time per trial: {avg_time:.2f}s") + print(f" Min time: {min_time:.2f}s") + print(f" Max time: {max_time:.2f}s") + print(f" Total tokens generated: {total_tokens}") + print(f" Total prompts processed: {total_prompts}") + print(f" Throughput: {throughput:.2f} tokens/s") + print(f" Prompts/s: {prompts_per_sec:.2f}") + print(f"{'=' * 80}\n") + + return { + "init_time": init_time, + "avg_time": avg_time, + "min_time": min_time, + "max_time": max_time, + "total_tokens": total_tokens, + "total_prompts": total_prompts, + "throughput": throughput, + "prompts_per_sec": prompts_per_sec, + "trial_times": trial_times, + } + + finally: + # Cleanup + if llm is not None: + with contextlib.suppress(Exception): + llm.shutdown() + + +def main(): + # Check platform support + if not (current_platform.is_cuda() and current_platform.has_device_capability(90)): + print("ERROR: Requires CUDA and >= Hopper (SM90)") + print(f"Current platform: {current_platform.device_type}") + if current_platform.is_cuda(): + print(f"Device capability: {current_platform.get_device_capability()}") + return 1 + + # Read configuration from environment + model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B") + tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1")) + max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128")) + num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5")) + min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024")) + max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048")) + max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128")) + temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0")) + gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4")) + max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120")) + backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN") + + print("\n" + "=" * 80) + print("VLLM BATCH INVARIANCE BENCHMARK") + print("=" * 80) + print("\nConfiguration:") + print(f" Model: {model}") + print(f" Tensor Parallel Size: {tp_size}") + print(f" Attention Backend: {backend}") + print(f" Max Batch Size: {max_batch_size}") + print(f" Number of Trials: {num_trials}") + print(f" Prompt Length Range: {min_prompt}-{max_prompt} words") + print(f" Max Tokens to Generate: {max_tokens}") + print(f" Temperature: {temperature}") + print(f" GPU Memory Utilization: {gpu_mem_util}") + print(f" Max Model Length: {max_model_len}") + print("=" * 80) + + # Run benchmark WITHOUT batch invariance (baseline) + print("\n" + "=" * 80) + print("PHASE 1: Running WITHOUT batch invariance (baseline)") + print("=" * 80) + baseline_results = run_benchmark_with_batch_invariant( + model=model, + tp_size=tp_size, + max_batch_size=max_batch_size, + num_trials=num_trials, + min_prompt=min_prompt, + max_prompt=max_prompt, + max_tokens=max_tokens, + temperature=temperature, + gpu_mem_util=gpu_mem_util, + max_model_len=max_model_len, + backend=backend, + batch_invariant=False, + ) + + # Run benchmark WITH batch invariance + print("\n" + "=" * 80) + print("PHASE 2: Running WITH batch invariance") + print("=" * 80) + batch_inv_results = run_benchmark_with_batch_invariant( + model=model, + tp_size=tp_size, + max_batch_size=max_batch_size, + num_trials=num_trials, + min_prompt=min_prompt, + max_prompt=max_prompt, + max_tokens=max_tokens, + 
temperature=temperature, + gpu_mem_util=gpu_mem_util, + max_model_len=max_model_len, + backend=backend, + batch_invariant=True, + ) + + # Compare results + print("\n" + "=" * 80) + print("COMPARISON: Batch Invariance vs Baseline") + print("=" * 80) + + init_overhead_pct = ( + (batch_inv_results["init_time"] - baseline_results["init_time"]) + / baseline_results["init_time"] + * 100 + ) + time_overhead_pct = ( + (batch_inv_results["avg_time"] - baseline_results["avg_time"]) + / baseline_results["avg_time"] + * 100 + ) + throughput_change_pct = ( + (batch_inv_results["throughput"] - baseline_results["throughput"]) + / baseline_results["throughput"] + * 100 + ) + + print("\nInitialization Time:") + print(f" Baseline: {baseline_results['init_time']:.2f}s") + print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s") + print(f" Overhead: {init_overhead_pct:+.2f}%") + + print("\nAverage Trial Time:") + print(f" Baseline: {baseline_results['avg_time']:.2f}s") + print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s") + print(f" Overhead: {time_overhead_pct:+.2f}%") + + print("\nThroughput (tokens/s):") + print(f" Baseline: {baseline_results['throughput']:.2f}") + print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}") + print(f" Change: {throughput_change_pct:+.2f}%") + + print("\nPrompts/s:") + print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}") + print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}") + + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + if time_overhead_pct > 0: + print( + f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% " + "overhead" + ) + else: + print( + f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% " + "faster (unexpected!)" + ) + + if abs(throughput_change_pct) < 1.0: + print("Throughput difference is negligible (< 1%)") + elif throughput_change_pct < 0: + print( + f"Throughput decreased by {-throughput_change_pct:.1f}% " + "with batch invariance" + ) + else: + print( + f"Throughput increased by {throughput_change_pct:.1f}% " + "with batch invariance (unexpected!)" + ) + + print("=" * 80 + "\n") + + return 0 + + +if __name__ == "__main__": + exit(main()) From d231876ce31d8738a6e13a13591ae7d90d8b93f7 Mon Sep 17 00:00:00 2001 From: ai-jz <156989844+ai-jz@users.noreply.github.com> Date: Sat, 15 Nov 2025 23:04:32 -0800 Subject: [PATCH 108/578] [Benchmark] Fix client seed synchronization in multi-turn benchmark (#28512) Signed-off-by: ai-jz --- benchmarks/multi_turn/benchmark_serving_multi_turn.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index ae9e9753441a..772d685ad90f 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -561,8 +561,11 @@ async def client_main( f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 ) - random.seed(args.seed) - np.random.seed(args.seed) + # Set unique seed per client (each client runs in its own process) + # Add 1 to ensure no client uses the same seed as the main process + client_seed = args.seed + client_id + 1 + random.seed(client_seed) + np.random.seed(client_seed) # Active conversations active_convs: ConversationsMap = {} @@ -1490,6 +1493,7 @@ async def main() -> None: f"Invalid 
--warmup-percentage={args.warmup_percentage}" ) from None + # Set global seeds for main process random.seed(args.seed) np.random.seed(args.seed) From a55b64635c272ff1f34d20593140faa1fcbe4580 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sun, 16 Nov 2025 16:04:50 +0800 Subject: [PATCH 109/578] [Model] Allow users to control skip reading cache per request. (#28194) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi --- .../pooling/test_extract_hidden_states.py | 29 +++++++++++++++++-- vllm/pooling_params.py | 12 ++++++++ vllm/sampling_params.py | 8 +++++ vllm/v1/core/kv_cache_manager.py | 11 ++++--- vllm/v1/request.py | 15 ++++++++++ 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/tests/models/language/pooling/test_extract_hidden_states.py b/tests/models/language/pooling/test_extract_hidden_states.py index f8e3fa7d1560..0d41b93233d5 100644 --- a/tests/models/language/pooling/test_extract_hidden_states.py +++ b/tests/models/language/pooling/test_extract_hidden_states.py @@ -11,7 +11,7 @@ ["Qwen/Qwen3-0.6B"], ) @torch.inference_mode -def test_embed_models(hf_runner, vllm_runner, model: str): +def test_extract_hidden_states(hf_runner, vllm_runner, model: str): n_prompt_tokens = [55, 56, 57] token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens] @@ -21,7 +21,7 @@ def test_embed_models(hf_runner, vllm_runner, model: str): enforce_eager=True, runner="pooling", enable_chunked_prefill=False, - enable_prefix_caching=False, + enable_prefix_caching=True, ) as vllm_model: pooling_outputs = vllm_model.llm.encode( [TokensPrompt(prompt_token_ids=t) for t in token_prompts], @@ -30,4 +30,29 @@ def test_embed_models(hf_runner, vllm_runner, model: str): for n, output in zip(n_prompt_tokens, pooling_outputs): assert len(output.prompt_token_ids) == n + assert len(output.outputs.data) == n assert output.num_cached_tokens == 0 + + # test enable_prefix_caching plus all pooling + # we need to skip reading cache at this request by + # request.skip_reading_prefix_cache + pooling_outputs = vllm_model.llm.encode( + [TokensPrompt(prompt_token_ids=t) for t in token_prompts], + pooling_task="token_embed", + ) + + for n, output in zip(n_prompt_tokens, pooling_outputs): + assert len(output.prompt_token_ids) == n + assert len(output.outputs.data) == n + assert output.num_cached_tokens == 0 + + # skip_reading_prefix_cache can still write to cache + # to accelerate following requests + pooling_outputs = vllm_model.llm.encode( + [TokensPrompt(prompt_token_ids=t) for t in token_prompts], + pooling_task="embed", + ) + + for n, output in zip(n_prompt_tokens, pooling_outputs): + assert len(output.prompt_token_ids) == n + assert output.num_cached_tokens > 0 diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 72a8320cc1bf..5c3dfa8ac9cb 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -57,6 +57,7 @@ class PoolingParams( ## Internal use only task: PoolingTask | None = None requires_token_ids: bool = False + skip_reading_prefix_cache: bool = None extra_kwargs: dict[str, Any] | None = None output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY @@ -93,6 +94,8 @@ def verify( # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": + if self.skip_reading_prefix_cache is None: + self.skip_reading_prefix_cache = True return # NOTE: Task validation needs to done against the model instance, @@ -122,6 +125,15 @@ def _merge_default_parameters( if getattr(self, k, None) is None: setattr(self, k, 
getattr(pooler_config, k)) + if self.skip_reading_prefix_cache is None: + # If prefix caching is enabled, + # the output of all pooling may less than n_prompt_tokens, + # we need to skip reading cache at this request. + if self.task in ["token_embed", "token_classify"]: + self.skip_reading_prefix_cache = True + else: + self.skip_reading_prefix_cache = False + self._verify_step_pooling(pooler_config, valid_parameters) def _verify_step_pooling( diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index dd820840410e..901d66163452 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -254,6 +254,8 @@ class SamplingParams( generated token can complete the sequence.""" _bad_words_token_ids: list[list[int]] | None = None + skip_reading_prefix_cache: bool = None + @staticmethod def from_optional( n: int | None = 1, @@ -414,6 +416,12 @@ def __post_init__(self) -> None: self.structured_outputs = self.guided_decoding self.guided_decoding = None + if self.skip_reading_prefix_cache is None: + # If prefix caching is enabled, + # the output of prompt logprobs may less than n_prompt_tokens, + # we need to skip reading cache at this request. + self.skip_reading_prefix_cache = self.prompt_logprobs is not None + def _verify_args(self) -> None: if not isinstance(self.n, int): raise ValueError(f"n must be an int, but is of type {type(self.n)}") diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 63a1ff06e404..7f405fc248ac 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -185,12 +185,11 @@ def get_computed_blocks(self, request: Request) -> tuple[KVCacheBlocks, int]: - A list of blocks that are computed for the request. - The number of computed tokens. """ - # Prefix caching is disabled or - # When the request requires prompt logprobs, we skip prefix caching. - if not self.enable_caching or ( - request.sampling_params is not None - and request.sampling_params.prompt_logprobs is not None - ): + # We skip finding the prefix cache hit when prefix caching is + # disabled or the request is marked as skipping kv cache read + # (which happens when the request requires prompt logprobs + # or calls a pooling model with all pooling). 
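The rule this patch encodes can be read off from the comment above: a request bypasses prefix-cache reads when its sampling parameters ask for prompt logprobs, or when its pooling task produces per-token outputs (token_embed / token_classify), while cache writes still happen so later requests can reuse the blocks, as the test added earlier in this patch checks. A minimal, self-contained sketch of that predicate, using hypothetical stand-in classes rather than vLLM's real SamplingParams/PoolingParams:

```python
from dataclasses import dataclass


@dataclass
class SamplingLike:
    """Hypothetical stand-in; the real SamplingParams has many more fields."""

    prompt_logprobs: int | None = None


@dataclass
class PoolingLike:
    """Hypothetical stand-in for PoolingParams."""

    task: str | None = None  # e.g. "embed", "token_embed", "token_classify"


def skips_reading_prefix_cache(
    sampling: SamplingLike | None, pooling: PoolingLike | None
) -> bool:
    # Sketch of the decision wired through Request.get_skip_reading_prefix_cache.
    if sampling is not None:
        # Prompt logprobs need logits for every prompt position, so a cache
        # hit that skips prompt tokens would drop required outputs.
        return sampling.prompt_logprobs is not None
    if pooling is not None:
        # All-token pooling needs hidden states for every prompt token.
        return pooling.task in ("token_embed", "token_classify")
    return False


assert skips_reading_prefix_cache(SamplingLike(prompt_logprobs=5), None)
assert skips_reading_prefix_cache(None, PoolingLike(task="token_embed"))
assert not skips_reading_prefix_cache(None, PoolingLike(task="embed"))
```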
+ if not self.enable_caching or request.skip_reading_prefix_cache: return self.empty_kv_cache_blocks, 0 # NOTE: When all tokens hit the cache, we must recompute the last token diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 7a5f1183ed48..3d92906fbf4b 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -127,6 +127,8 @@ def __init__( self.get_hash_new_full_blocks = partial(block_hasher, self) self.block_hashes = self.get_hash_new_full_blocks() + self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache() + @classmethod def from_engine_core_request( cls, @@ -180,6 +182,19 @@ def num_tokens_with_spec(self) -> int: def num_output_tokens(self) -> int: return len(self._output_token_ids) + def get_skip_reading_prefix_cache(self) -> bool: + if ( + self.sampling_params is not None + and self.sampling_params.skip_reading_prefix_cache is not None + ): + return self.sampling_params.skip_reading_prefix_cache + elif ( + self.pooling_params is not None + and self.pooling_params.skip_reading_prefix_cache is not None + ): + return self.pooling_params.skip_reading_prefix_cache + return False + def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) From b316ac658985f542618316b4285bd213dfdde046 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Sun, 16 Nov 2025 01:01:21 -0800 Subject: [PATCH 110/578] [V1] Support MP Executor for multi node distributed inference (#23691) Signed-off-by: Lu Fang Signed-off-by: github-actions[bot] Signed-off-by: Lucia Fang Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Signed-off-by: Nick Hill Co-authored-by: Nick Hill --- tests/distributed/test_multiproc_executor.py | 437 ++++++++++++++++++ vllm/config/parallel.py | 40 ++ .../device_communicators/shm_broadcast.py | 110 ++++- vllm/distributed/parallel_state.py | 77 ++- vllm/engine/arg_utils.py | 91 +++- vllm/entrypoints/cli/serve.py | 31 +- vllm/v1/engine/utils.py | 15 +- vllm/v1/executor/multiproc_executor.py | 197 ++++++-- vllm/v1/worker/gpu_worker.py | 10 +- vllm/v1/worker/worker_base.py | 4 +- 10 files changed, 930 insertions(+), 82 deletions(-) create mode 100644 tests/distributed/test_multiproc_executor.py diff --git a/tests/distributed/test_multiproc_executor.py b/tests/distributed/test_multiproc_executor.py new file mode 100644 index 000000000000..e741a79bc4ed --- /dev/null +++ b/tests/distributed/test_multiproc_executor.py @@ -0,0 +1,437 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Integration tests for MultiprocExecutor at the executor level. +This test directly tests the executor without going through the LLM interface, +focusing on executor initialization, RPC calls, and distributed execution. 
+""" + +import multiprocessing +import os + +from tests.utils import multi_gpu_test +from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import get_open_port +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.executor.multiproc_executor import MultiprocExecutor + +MODEL = "facebook/opt-125m" + + +def create_vllm_config( + tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, + max_model_len: int = 256, + gpu_memory_utilization: float = 0.3, + distributed_executor_backend: str = "mp", + nnodes: int = 1, + node_rank: int = 0, + master_port: int = 0, +) -> VllmConfig: + """Create a VllmConfig for testing using EngineArgs.""" + engine_args = EngineArgs( + model=MODEL, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + + # Override distributed node settings if needed + if nnodes > 1 or node_rank > 0: + vllm_config.parallel_config.nnodes = nnodes + vllm_config.parallel_config.node_rank = node_rank + vllm_config.parallel_config.master_port = master_port + if nnodes > 1: + vllm_config.parallel_config.disable_custom_all_reduce = True + + return vllm_config + + +def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput: + """Create a minimal SchedulerOutput for testing.""" + # This is a simplified version - in practice you'd need proper + # SchedulerOutput construction based on the actual vLLM v1 API + return SchedulerOutput( + scheduled_new_reqs=[], + scheduled_resumed_reqs=[], + scheduled_running_reqs=[], + num_scheduled_tokens={}, + total_num_scheduled_tokens=0, + ) + + +def test_multiproc_executor_initialization(): + """Test that MultiprocExecutor can be initialized with proper config.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + # Create executor - this should initialize workers + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify executor properties + assert executor.world_size == 1, "World size should be 1 for single GPU" + assert executor.local_world_size == 1, "Local world size should be 1" + assert hasattr(executor, "workers"), "Executor should have workers" + assert len(executor.workers) == 1, "Should have 1 worker for single GPU" + + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_initialization_tensor_parallel(): + """Test MultiprocExecutor initialization with tensor parallelism.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + # Create executor + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify executor properties + assert executor.world_size == 2, "World size should be 2 for TP=2" + assert executor.local_world_size == 2, "Local world size should be 2" + assert len(executor.workers) == 2, "Should have 2 workers for TP=2" + + # Verify output rank calculation + output_rank = executor._get_output_rank() + assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1" + + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_collective_rpc(): + """Test collective RPC calls to all workers.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + # Create executor + 
executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Test check_health RPC - should work without errors + executor.check_health() + + # Test that RPC works correctly + # Note: We're just testing that the RPC mechanism works, + # not testing actual model execution here + assert not executor.is_failed, "Executor should not be in failed state" + + finally: + # Clean up + executor.shutdown() + + +def test_multiproc_executor_failure_callback(): + """Test failure callback registration and invocation.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Test callback registration + callback_invoked = [] + + def test_callback(): + callback_invoked.append(True) + + # Register callback + executor.register_failure_callback(test_callback) + + # Callback should not be invoked yet + assert len(callback_invoked) == 0, "Callback should not be invoked immediately" + + # Simulate failure + executor.is_failed = True + + # Register another callback - should be invoked immediately + executor.register_failure_callback(test_callback) + assert len(callback_invoked) == 1, ( + "Callback should be invoked when executor is failed" + ) + + finally: + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_worker_monitor(): + """Test that worker monitor is set up correctly.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Verify all worker processes are alive + for worker in executor.workers: + assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive" + + # Verify executor is not in failed state + assert not executor.is_failed, "Executor should not be in failed state" + + finally: + # Clean up + executor.shutdown() + + # After shutdown, workers should be terminated + import time + + time.sleep(0.5) # Give processes time to terminate + for worker in executor.workers: + assert not worker.proc.is_alive(), ( + f"Worker rank {worker.rank} should terminate after shutdown" + ) + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_get_response_message_queues(): + """Test message queue retrieval for different ranks.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Get all message queues + all_queues = executor.get_response_mqs() + assert len(all_queues) == 2, "Should have 2 message queues for 2 workers" + + # Get message queue for specific rank + rank0_queue = executor.get_response_mqs(unique_reply_rank=0) + assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0" + + rank1_queue = executor.get_response_mqs(unique_reply_rank=1) + assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1" + + finally: + # Clean up + executor.shutdown() + + +def test_multiproc_executor_shutdown_cleanup(): + """Test that shutdown properly cleans up resources.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify executor is set up + assert hasattr(executor, "workers"), "Executor should have workers" + assert len(executor.workers) > 0, "Should have at least one worker" + + # Shutdown + executor.shutdown() + + # Verify cleanup + import time + + time.sleep(0.5) # Give processes time to 
terminate + + for worker in executor.workers: + assert not worker.proc.is_alive(), "Worker processes should be terminated" + + # Verify shutdown event is set + assert executor.shutdown_event.is_set(), "Shutdown event should be set" + + # Multiple shutdowns should be safe (idempotent) + executor.shutdown() + executor.shutdown() + + +@multi_gpu_test(num_gpus=4) +def test_multiproc_executor_pipeline_parallel(): + """Test MultiprocExecutor with pipeline parallelism.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=2, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Verify executor properties + assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2" + assert len(executor.workers) == 4, "Should have 4 workers" + + # Verify output rank calculation + # For TP=2, PP=2: output should be from the last PP stage (ranks 2-3) + # Specifically rank 2 (first rank of last PP stage) + output_rank = executor._get_output_rank() + assert output_rank == 2, "Output rank should be 2 (first rank of last PP stage)" + + # Verify max_concurrent_batches for pipeline parallel + assert executor.max_concurrent_batches == 2, ( + "Max concurrent batches should equal PP size" + ) + + finally: + # Clean up + executor.shutdown() + + +def test_multiproc_executor_properties(): + """Test various executor properties and configurations.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Test supports_pp property + assert MultiprocExecutor.supports_pp is True, ( + "MultiprocExecutor should support pipeline parallelism" + ) + + # Test world_size calculation + assert executor.world_size == ( + executor.parallel_config.tensor_parallel_size + * executor.parallel_config.pipeline_parallel_size + ), "World size should equal TP * PP" + + # Test local_world_size calculation + assert executor.local_world_size == ( + executor.parallel_config.world_size // executor.parallel_config.nnodes + ), "Local world size should be world_size / nnodes" + + finally: + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=4) +def test_multiproc_executor_multi_node(): + """ + Test MultiprocExecutor with multi-node configuration. 
+ This simulates 2 nodes with TP=4: + - Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2 + - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2 + Total world_size = 4, nnodes = 2 + """ + port = get_open_port() + # symm_mem does not work for simulating multi instance in single node + os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" + + def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int): + """Run a single node's executor.""" + executor = None + try: + # Set CUDA_VISIBLE_DEVICES for this node + if node_rank == 0: + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" + else: + os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" + + # Create config for this node + vllm_config = create_vllm_config( + tensor_parallel_size=4, # Total TP across all nodes + pipeline_parallel_size=1, + nnodes=2, # 2 nodes + node_rank=node_rank, + master_port=port, # same port + ) + + # Create executor for this node + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify node-specific properties + assert executor.world_size == 4, ( + f"World size should be 4 on node {node_rank}" + ) + assert executor.local_world_size == 2, ( + f"Local world size should be 2 on node {node_rank}" + ) + assert len(executor.workers) == 2, ( + f"Should have 2 local workers on node {node_rank}" + ) + + # Verify worker ranks are correct for this node + expected_ranks = [node_rank * 2, node_rank * 2 + 1] + actual_ranks = sorted([w.rank for w in executor.workers]) + assert actual_ranks == expected_ranks, ( + f"Node {node_rank} should have workers " + f"with ranks {expected_ranks}, got {actual_ranks}" + ) + # Verify all workers are alive + for worker in executor.workers: + assert worker.proc.is_alive(), ( + f"Worker rank {worker.rank} should be alive on node {node_rank}" + ) + # executor.gen + # Put success result in queue BEFORE shutdown to avoid hanging + result_queue.put({"node": node_rank, "success": True}) + import time + + time.sleep(2) + executor.shutdown() + except Exception as e: + # Put failure result in queue + result_queue.put({"node": node_rank, "success": False, "error": str(e)}) + raise e + finally: + if executor is not None: + executor.shutdown() + + # Create a queue to collect results from both processes + result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue() + + # Start both node processes + processes = [] + for node_rank in range(2): + p = multiprocessing.Process( + target=run_node, + args=(node_rank, result_queue, port), + name=f"Node{node_rank}", + ) + p.start() + processes.append(p) + + # Wait for both processes to complete + all_completed = True + for p in processes: + p.join(timeout=60) + if p.is_alive(): + p.terminate() + p.join(timeout=20) + if p.is_alive(): + p.kill() + p.join() + all_completed = False + + # Check results from both nodes + results: list[dict[str, int | bool]] = [] + while len(results) < 2: + try: + result = result_queue.get(timeout=1) + results.append(result) + except Exception: + pass + assert all_completed, "Not all processes completed successfully" + assert len(results) == 2, f"Expected 2 results, got {len(results)}" + assert results[0]["success"], f"Node 0 failed: {results[0]}" + assert results[1]["success"], f"Node 1 failed: {results[1]}" diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 61bcd15e06a8..9a6326d62e82 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -210,6 +210,18 @@ class ParallelConfig: class is dynamically inherited by the worker class. 
This is used to inject new attributes and methods to the worker class for use in collective_rpc calls.""" + master_addr: str = "127.0.0.1" + """distributed master address for multi-node distributed + inference when distributed_executor_backend is mp.""" + master_port: int = 29501 + """distributed master port for multi-node distributed + inference when distributed_executor_backend is mp.""" + node_rank: int = 0 + """distributed node rank for multi-node distributed + inference when distributed_executor_backend is mp.""" + nnodes: int = 1 + """num of nodes for multi-node distributed + inference when distributed_executor_backend is mp.""" world_size: int = Field(init=False) """world_size is TPxPP, it affects the number of workers we create.""" @@ -387,6 +399,23 @@ def use_sequence_parallel_moe(self) -> bool: and self.data_parallel_size > 1 ) + @property + def node_rank_within_dp(self) -> int: + return self.node_rank % self.nnodes_within_dp + + @property + def nnodes_within_dp(self) -> int: + if self.nnodes == 1: + return 1 + data_parallel_node_size = ( + self.data_parallel_size // self.data_parallel_size_local + ) + return self.nnodes // data_parallel_node_size + + @property + def local_world_size(self) -> int: + return self.world_size // self.nnodes_within_dp + @staticmethod def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool: tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu") @@ -528,6 +557,8 @@ def __post_init__(self) -> None: ray_found = ray_utils.ray_is_available() if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: backend = "uni" + elif current_platform.is_cuda() and self.nnodes > 1: + backend = "mp" elif ( current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size @@ -565,6 +596,10 @@ def __post_init__(self) -> None: "max_parallel_loading_workers is currently " "not supported and will be ignored." ) + if self.distributed_executor_backend != "mp" and self.nnodes > 1: + raise ValueError( + "nnodes > 1 can only be set when distributed exectuor backend is mp." + ) @property def use_ray(self) -> bool: @@ -607,6 +642,11 @@ def _verify_args(self) -> Self: "Disabled the custom all-reduce kernel because it is not " "supported on current platform." ) + if self.nnodes > 1: + self.disable_custom_all_reduce = True + logger.debug( + "Disabled the custom all-reduce since we are running on multi-node." + ) if self.ray_workers_use_nsight and not self.use_ray: raise ValueError( "Unable to use nsight profiling unless workers run with Ray." diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 5046cac2e90a..052df19e34d7 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -8,7 +8,7 @@ from multiprocessing import shared_memory from pickle import PickleBuffer from threading import Event -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast from unittest.mock import patch import torch @@ -602,13 +602,87 @@ def broadcast_object(self, obj=None): return obj return self.dequeue() + @staticmethod + def create_from_process_group_single_reader( + pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + reader_rank: int = 0, + blocking: bool = False, + ) -> tuple["MessageQueue", list[Handle]]: + """ + Creates a MessageQueue for a process group with a single reader. 
+ + This method is designed for scenarios where only one process (the reader) + will consume messages, and all other processes are writers. It sets up + the shared memory buffer and communication handles accordingly, and + gathers the handles from all processes to the reader. + + Args: + pg (ProcessGroup): The torch distributed process group. + max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer. + max_chunks (int): Maximum number of chunks in the buffer. + reader_rank (int, optional): The global rank that will act as the reader. + Defaults to 0. + blocking (bool, optional): If True, blocks until all processes are ready. + Defaults to False. + + Returns: + tuple[MessageQueue, list[Handle]]: + The MessageQueue instance for the calling process, + and a list of handles (only non-empty for the reader process). + """ + local_size = torch.cuda.device_count() + rank = dist.get_rank() + same_node = rank // local_size == reader_rank // local_size + buffer_io = MessageQueue( + n_reader=1, + n_local_reader=1 if same_node else 0, + max_chunk_bytes=max_chunk_bytes, + max_chunks=max_chunks, + ) + handle = buffer_io.export_handle() + handles = [None] * dist.get_world_size(pg) if rank == reader_rank else None + dist.gather_object(handle, handles, dst=reader_rank, group=pg) + if blocking: + buffer_io.wait_until_ready() + return buffer_io, cast(list[Handle], handles or []) + @staticmethod def create_from_process_group( pg: ProcessGroup | StatelessProcessGroup, max_chunk_bytes, max_chunks, - writer_rank=0, + writer_rank: int = 0, + external_writer_handle=None, + blocking: bool = True, ) -> "MessageQueue": + """ + Creates a MessageQueue for a distributed process group with one writer and + multiple readers. + + This method is designed for scenarios where one process (the writer) sends + messages, and all other processes (the readers) receive messages. It sets up + the shared memory buffer and socket communication handles accordingly, and + broadcasts the handle from the writer to all readers. + + Args: + pg (ProcessGroup | StatelessProcessGroup): The torch distributed process + group. + max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer. + max_chunks (int): Maximum number of chunks in the buffer. + writer_rank (int, optional): The global rank that will act as the writer. + Defaults to 0. + external_writer_handle (Handle, optional): Used when there is a handle + from an external Message Queue. If provided, use this handle to init + PG writer message queue instead of creating a new one. Defaults to None. + blocking (bool, optional): If True, blocks until all processes are ready. + Defaults to True. + + Returns: + MessageQueue: The MessageQueue instance for the calling process. 
+ + """ if isinstance(pg, ProcessGroup): group_rank = dist.get_rank(pg) group_world_size = dist.get_world_size(pg) @@ -617,23 +691,26 @@ def create_from_process_group( group_rank = pg.rank group_world_size = pg.world_size global_ranks = list(range(pg.world_size)) - from vllm.distributed.parallel_state import in_the_same_node_as status = in_the_same_node_as(pg, source_rank=writer_rank) - same_node_ranks = [i for i, s in enumerate(status) if s] - n_reader = group_world_size - 1 - n_local_reader = len(same_node_ranks) - 1 - local_reader_ranks = [i for i in same_node_ranks if i != writer_rank] - buffer_io: MessageQueue if group_rank == writer_rank: - buffer_io = MessageQueue( - n_reader=n_reader, - n_local_reader=n_local_reader, - local_reader_ranks=local_reader_ranks, - max_chunk_bytes=max_chunk_bytes, - max_chunks=max_chunks, - ) + if external_writer_handle is not None: + buffer_io = MessageQueue.create_from_handle( + external_writer_handle, group_rank + ) + else: + same_node_ranks = [i for i, s in enumerate(status) if s] + n_reader = group_world_size - 1 + n_local_reader = len(same_node_ranks) - 1 + local_reader_ranks = [i for i in same_node_ranks if i != writer_rank] + buffer_io = MessageQueue( + n_reader=n_reader, + n_local_reader=n_local_reader, + local_reader_ranks=local_reader_ranks, + max_chunk_bytes=max_chunk_bytes, + max_chunks=max_chunks, + ) handle = buffer_io.export_handle() if isinstance(pg, ProcessGroup): dist.broadcast_object_list( @@ -651,5 +728,6 @@ def create_from_process_group( else: handle = pg.broadcast_obj(None, writer_rank) buffer_io = MessageQueue.create_from_handle(handle, group_rank) - buffer_io.wait_until_ready() + if blocking: + buffer_io.wait_until_ready() return buffer_io diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c78e6a32733c..852c4c644433 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -385,6 +385,33 @@ def __init__( torch.ops._C, "init_shm_manager" ) + def create_mq_broadcaster( + self, writer_rank=0, external_writer_handle=None, blocking=True + ): + from vllm.distributed.device_communicators.shm_broadcast import MessageQueue + + return MessageQueue.create_from_process_group( + self.cpu_group, + 1 << 22, + 6, + writer_rank=writer_rank, + external_writer_handle=external_writer_handle, + blocking=blocking, + ) + + def create_single_reader_mq_broadcasters( + self, reader_rank_in_group=0, blocking=False + ): + from vllm.distributed.device_communicators.shm_broadcast import MessageQueue + + return MessageQueue.create_from_process_group_single_reader( + self.cpu_group, + 1 << 22, + 6, + reader_rank=self.ranks[reader_rank_in_group], + blocking=blocking, + ) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -997,6 +1024,7 @@ def combine( _WORLD: GroupCoordinator | None = None +_INNER_DP_WORLD: GroupCoordinator | None = None _NODE_COUNT: int | None = None @@ -1005,6 +1033,11 @@ def get_world_group() -> GroupCoordinator: return _WORLD +def get_inner_dp_world_group() -> GroupCoordinator: + assert _INNER_DP_WORLD is not None, "inner dp world group is not initialized" + return _INNER_DP_WORLD + + def init_world_group( ranks: list[int], local_rank: int, backend: str ) -> GroupCoordinator: @@ -1023,12 +1056,13 @@ def init_model_parallel_group( backend: str, use_message_queue_broadcaster: bool = False, group_name: str | None = None, + use_device_communicator: bool = True, ) -> GroupCoordinator: return GroupCoordinator( 
group_ranks=group_ranks, local_rank=local_rank, torch_distributed_backend=backend, - use_device_communicator=True, + use_device_communicator=use_device_communicator, use_message_queue_broadcaster=use_message_queue_broadcaster, group_name=group_name, ) @@ -1143,7 +1177,14 @@ def init_distributed_environment( from vllm.config import get_current_vllm_config config = get_current_vllm_config() - if ( + if config is not None and config.parallel_config.nnodes > 1: + parallel_config = config.parallel_config + ip = parallel_config.master_addr + rank = parallel_config.data_parallel_rank * world_size + rank + world_size = parallel_config.world_size_across_dp + port = parallel_config.master_port + distributed_init_method = get_distributed_init_method(ip, port) + elif ( config is not None and config.parallel_config.data_parallel_size > 1 and config.parallel_config.distributed_executor_backend != "external_launcher" @@ -1164,6 +1205,14 @@ def init_distributed_environment( distributed_init_method, ) if not torch.distributed.is_initialized(): + logger.info( + "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s", + world_size, + rank, + local_rank, + distributed_init_method, + backend, + ) assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " "distributed environment" @@ -1192,16 +1241,36 @@ def init_distributed_environment( # local rank not set, this usually happens in single-node # setting, where we can use rank as local rank local_rank = envs.LOCAL_RANK if distributed_init_method == "env://" else rank - global _WORLD, _NODE_COUNT + global _WORLD, _NODE_COUNT, _INNER_DP_WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) _WORLD = init_world_group(ranks, local_rank, backend) - _NODE_COUNT = _node_count(_WORLD.cpu_group) + if config.parallel_config.nnodes > 1: + _NODE_COUNT = config.parallel_config.nnodes + else: + _NODE_COUNT = _node_count(_WORLD.cpu_group) logger.debug("Detected %d nodes in the distributed environment", _NODE_COUNT) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size" ) + if config.parallel_config.nnodes_within_dp > 1: + if parallel_config.data_parallel_size > 1: + world_size_inner_dp = parallel_config.world_size + group_ranks = [ + [dp_rank * world_size_inner_dp + i for i in range(world_size_inner_dp)] + for dp_rank in range(parallel_config.data_parallel_size) + ] + _INNER_DP_WORLD = init_model_parallel_group( + group_ranks, + get_world_group().local_rank, + backend, + use_message_queue_broadcaster=True, + group_name="inner_dp_world", + use_device_communicator=False, + ) + else: + _INNER_DP_WORLD = _WORLD def initialize_model_parallel( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 999ed780c20b..d011dfdbfbb2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -384,6 +384,10 @@ class EngineArgs: ) = ParallelConfig.distributed_executor_backend # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size + master_addr: str = ParallelConfig.master_addr + master_port: int = ParallelConfig.master_port + nnodes: int = ParallelConfig.nnodes + node_rank: int = ParallelConfig.node_rank tensor_parallel_size: int = ParallelConfig.tensor_parallel_size decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size dcp_kv_cache_interleave_size: int = 
ParallelConfig.dcp_kv_cache_interleave_size @@ -394,6 +398,7 @@ class EngineArgs: data_parallel_address: str | None = None data_parallel_rpc_port: int | None = None data_parallel_hybrid_lb: bool = False + data_parallel_external_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel all2all_backend: str | None = ParallelConfig.all2all_backend @@ -749,6 +754,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "-pp", **parallel_kwargs["pipeline_parallel_size"], ) + parallel_group.add_argument("--master-addr", **parallel_kwargs["master_addr"]) + parallel_group.add_argument("--master-port", **parallel_kwargs["master_port"]) + parallel_group.add_argument("--nnodes", "-n", **parallel_kwargs["nnodes"]) + parallel_group.add_argument("--node-rank", "-r", **parallel_kwargs["node_rank"]) parallel_group.add_argument( "--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"] ) @@ -803,7 +812,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='Backend for data parallel, either "mp" or "ray".', ) parallel_group.add_argument( - "--data-parallel-hybrid-lb", **parallel_kwargs["data_parallel_hybrid_lb"] + "--data-parallel-hybrid-lb", + "-dph", + **parallel_kwargs["data_parallel_hybrid_lb"], + ) + parallel_group.add_argument( + "--data-parallel-external-lb", + "-dpe", + **parallel_kwargs["data_parallel_external_lb"], ) parallel_group.add_argument( "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"] @@ -1428,12 +1444,56 @@ def create_engine_config( assert not headless or not self.data_parallel_hybrid_lb, ( "data_parallel_hybrid_lb is not applicable in headless mode" ) - - data_parallel_external_lb = self.data_parallel_rank is not None + assert not (self.data_parallel_hybrid_lb and self.data_parallel_external_lb), ( + "data_parallel_hybrid_lb and data_parallel_external_lb cannot both be True." + ) + assert self.data_parallel_backend == "mp" or self.nnodes == 1, ( + "nnodes > 1 is only supported with data_parallel_backend=mp" + ) + inferred_data_parallel_rank = 0 + if self.nnodes > 1: + world_size = ( + self.data_parallel_size + * self.pipeline_parallel_size + * self.tensor_parallel_size + ) + world_size_within_dp = ( + self.pipeline_parallel_size * self.tensor_parallel_size + ) + local_world_size = world_size // self.nnodes + assert world_size % self.nnodes == 0, ( + f"world_size={world_size} must be divisible by nnodes={self.nnodes}." + ) + assert self.node_rank < self.nnodes, ( + f"node_rank={self.node_rank} must be less than nnodes={self.nnodes}." + ) + inferred_data_parallel_rank = ( + self.node_rank * local_world_size + ) // world_size_within_dp + if self.data_parallel_size > 1 and self.data_parallel_external_lb: + self.data_parallel_rank = inferred_data_parallel_rank + logger.info( + "Inferred data_parallel_rank %d from node_rank %d for external lb", + self.data_parallel_rank, + self.node_rank, + ) + elif self.data_parallel_size_local is None: + # Infer data parallel size local for internal dplb: + self.data_parallel_size_local = max( + local_world_size // world_size_within_dp, 1 + ) + data_parallel_external_lb = ( + self.data_parallel_external_lb or self.data_parallel_rank is not None + ) # Local DP rank = 1, use pure-external LB. if data_parallel_external_lb: + assert self.data_parallel_rank is not None, ( + "data_parallel_rank or node_rank must be spefified if " + "data_parallel_external_lb is enable." 
+ ) assert self.data_parallel_size_local in (1, None), ( - "data_parallel_size_local must be 1 when data_parallel_rank is set" + "data_parallel_size_local must be 1 or None when data_parallel_rank " + "is set" ) data_parallel_size_local = 1 # Use full external lb if we have local_size of 1. @@ -1447,6 +1507,11 @@ def create_engine_config( if self.data_parallel_hybrid_lb and data_parallel_size_local == 1: # Use full external lb if we have local_size of 1. + logger.warning( + "data_parallel_hybrid_lb is not eligible when " + "data_parallel_size_local = 1, autoswitch to " + "data_parallel_external_lb." + ) data_parallel_external_lb = True self.data_parallel_hybrid_lb = False @@ -1454,7 +1519,15 @@ def create_engine_config( # Disable hybrid LB mode if set for a single node self.data_parallel_hybrid_lb = False - self.data_parallel_rank = self.data_parallel_start_rank or 0 + self.data_parallel_rank = ( + self.data_parallel_start_rank or inferred_data_parallel_rank + ) + if self.nnodes > 1: + logger.info( + "Inferred data_parallel_rank %d from node_rank %d", + self.data_parallel_rank, + self.node_rank, + ) else: assert not self.data_parallel_hybrid_lb, ( "data_parallel_size_local must be set to use data_parallel_hybrid_lb." @@ -1484,7 +1557,9 @@ def create_engine_config( "data_parallel_backend can only be ray or mp, got %s", self.data_parallel_backend, ) - data_parallel_address = ParallelConfig.data_parallel_master_ip + data_parallel_address = ( + self.master_addr or ParallelConfig.data_parallel_master_ip + ) else: data_parallel_address = self.data_parallel_address @@ -1517,6 +1592,10 @@ def create_engine_config( data_parallel_rank=self.data_parallel_rank or 0, data_parallel_external_lb=data_parallel_external_lb, data_parallel_size_local=data_parallel_size_local, + master_addr=self.master_addr, + master_port=self.master_port, + nnodes=self.nnodes, + node_rank=self.node_rank, data_parallel_master_ip=data_parallel_address, data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 2678658dd126..96608f360e17 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -24,6 +24,7 @@ from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor import Executor +from vllm.v1.executor.multiproc_executor import MultiprocExecutor from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure @@ -97,18 +98,40 @@ def run_headless(args: argparse.Namespace): if local_engine_count <= 0: raise ValueError("data_parallel_size_local must be > 0 in headless mode") - host = parallel_config.data_parallel_master_ip - port = engine_args.data_parallel_rpc_port # add to config too - handshake_address = get_tcp_uri(host, port) + shutdown_requested = False # Catch SIGTERM and SIGINT to allow graceful shutdown. def signal_handler(signum, frame): + nonlocal shutdown_requested logger.debug("Received %d signal.", signum) - raise SystemExit + if not shutdown_requested: + shutdown_requested = True + raise SystemExit signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + if parallel_config.node_rank_within_dp > 0: + from vllm.version import __version__ as VLLM_VERSION + + # Run headless workers (for multi-node PP/TP). 
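For a concrete picture of how a two-node deployment is meant to be described with the new arguments, here is a hedged sketch; the model name, address, and port values are placeholders, not taken from the patch. Node rank 0 hosts the API server, while a node with a non-zero rank takes the headless branch above and only runs the MultiprocExecutor workers:

```python
from vllm.engine.arg_utils import EngineArgs

# Illustrative only: two nodes, 2 GPUs each, one tensor-parallel group of 4.
node_rank = 0  # set to 1 on the second machine

args = EngineArgs(
    model="facebook/opt-125m",       # placeholder model
    tensor_parallel_size=4,          # spans both nodes
    distributed_executor_backend="mp",
    nnodes=2,
    node_rank=node_rank,
    master_addr="192.0.2.10",        # reachable address of node rank 0
    master_port=29501,
)
# create_engine_config() then builds a ParallelConfig whose local world size
# is 4 // 2 == 2 workers per node, and infers the node's data-parallel rank
# from node_rank.
```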
+ host = parallel_config.master_addr + head_node_address = f"{host}:{parallel_config.master_port}" + logger.info( + "Launching vLLM (v%s) headless multiproc executor, " + "with head node address %s for torch.distributed process group.", + VLLM_VERSION, + head_node_address, + ) + + executor = MultiprocExecutor(vllm_config, monitor_workers=False) + executor.start_worker_monitor(inline=True) + return + + host = parallel_config.data_parallel_master_ip + port = parallel_config.data_parallel_rpc_port + handshake_address = get_tcp_uri(host, port) + logger.info( "Launching %d data parallel engine(s) in headless mode, " "with head node address %s.", diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index e74519b21aa6..d65cad7af03d 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -183,15 +183,19 @@ def set_device_control_env_var( for engine subprocess. """ world_size = vllm_config.parallel_config.world_size + local_world_size = vllm_config.parallel_config.local_world_size evar = current_platform.device_control_env_var - value = get_device_indices(evar, local_dp_rank, world_size) + value = get_device_indices(evar, local_dp_rank, world_size, local_world_size) with patch.dict(os.environ, values=((evar, value),)): yield def get_device_indices( - device_control_env_var: str, local_dp_rank: int, world_size: int + device_control_env_var: str, + local_dp_rank: int, + world_size: int, + local_world_size: int | None = None, ): """ Returns a comma-separated string of device indices for the specified @@ -200,10 +204,15 @@ def get_device_indices( For example, if world_size=2 and local_dp_rank=1, and there are 4 devices, this will select devices 2 and 3 for local_dp_rank=1. """ + if local_world_size is None: + local_world_size = world_size try: value = ",".join( str(current_platform.device_id_to_physical_device_id(i)) - for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size) + for i in range( + local_dp_rank * world_size, + local_dp_rank * world_size + local_world_size, + ) ) except IndexError as e: raise Exception( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 881e6ef40aaf..ad2ece50f981 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -10,7 +10,7 @@ import traceback import weakref from collections import deque -from collections.abc import Callable +from collections.abc import Callable, Sequence from concurrent.futures import Future, InvalidStateError from contextlib import suppress from dataclasses import dataclass @@ -34,6 +34,7 @@ get_dcp_group, get_dp_group, get_ep_group, + get_inner_dp_world_group, get_pp_group, get_tp_group, ) @@ -90,6 +91,10 @@ def wait_for_response(self, get_response: Callable): class MultiprocExecutor(Executor): supports_pp: bool = True + def __init__(self, vllm_config: VllmConfig, monitor_workers: bool = True): + self.monitor_workers = monitor_workers + super().__init__(vllm_config) + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. @@ -99,6 +104,12 @@ def _init_executor(self) -> None: self.failure_callback: FailureCallback | None = None self.world_size = self.parallel_config.world_size + assert self.world_size % self.parallel_config.nnodes_within_dp == 0, ( + f"global world_size ({self.parallel_config.world_size}) must be " + f"divisible by nnodes_within_dp " + f"({self.parallel_config.nnodes_within_dp}). 
" + ) + self.local_world_size = self.parallel_config.local_world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size pp_parallel_size = self.parallel_config.pipeline_parallel_size assert self.world_size == tensor_parallel_size * pp_parallel_size, ( @@ -116,27 +127,37 @@ def _init_executor(self) -> None: distributed_init_method = get_distributed_init_method( get_loopback_ip(), get_open_port() ) - + self.rpc_broadcast_mq: MessageQueue | None = None + scheduler_output_handle: Handle | None = None # Initialize worker and set up message queues for SchedulerOutputs # and ModelRunnerOutputs - max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024 - self.rpc_broadcast_mq = MessageQueue( - self.world_size, self.world_size, max_chunk_bytes=max_chunk_bytes - ) - scheduler_output_handle = self.rpc_broadcast_mq.export_handle() - + if self.parallel_config.node_rank_within_dp == 0: + # For leader node within each dp rank, + # each dp will have its own leader multiproc executor. + max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024 + self.rpc_broadcast_mq = MessageQueue( + self.world_size, + self.local_world_size, + max_chunk_bytes=max_chunk_bytes, + connect_ip=self.parallel_config.master_addr, + ) + scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers context = get_mp_context() shared_worker_lock = context.Lock() unready_workers: list[UnreadyWorkerProcHandle] = [] success = False try: - for rank in range(self.world_size): + global_start_rank = ( + self.local_world_size * self.parallel_config.node_rank_within_dp + ) + for local_rank in range(self.local_world_size): + global_rank = global_start_rank + local_rank unready_workers.append( WorkerProc.make_worker_process( vllm_config=self.vllm_config, - local_rank=rank, - rank=rank, + local_rank=local_rank, + rank=global_rank, distributed_init_method=distributed_init_method, input_shm_handle=scheduler_output_handle, shared_worker_lock=shared_worker_lock, @@ -145,15 +166,38 @@ def _init_executor(self) -> None: # Workers must be created before wait_for_ready to avoid # deadlock, since worker.init_device() does a device sync. + + # Wait for all local workers to be ready. self.workers = WorkerProc.wait_for_ready(unready_workers) + # Start background thread to monitor worker health if not in headless mode. + if self.monitor_workers: + self.start_worker_monitor() + + self.response_mqs = [] + # Only leader node have remote response mqs + if self.parallel_config.node_rank_within_dp == 0: + for rank in range(self.world_size): + if rank < self.local_world_size: + local_message_queue = self.workers[rank].worker_response_mq + assert local_message_queue is not None + self.response_mqs.append(local_message_queue) + else: + remote_message_queue = self.workers[0].peer_worker_response_mqs[ + rank + ] + assert remote_message_queue is not None + self.response_mqs.append(remote_message_queue) + # Ensure message queues are ready. Will deadlock if re-ordered # Must be kept consistent with the WorkerProc. - self.rpc_broadcast_mq.wait_until_ready() - for w in self.workers: - w.worker_response_mq.wait_until_ready() - self.start_worker_monitor() + # Wait for all input mqs to be ready. + if self.rpc_broadcast_mq is not None: + self.rpc_broadcast_mq.wait_until_ready() + # Wait for all remote response mqs to be ready. 
+ for response_mq in self.response_mqs: + response_mq.wait_until_ready() success = True finally: if not success: @@ -168,7 +212,7 @@ def _init_executor(self) -> None: self.output_rank = self._get_output_rank() - def start_worker_monitor(self): + def start_worker_monitor(self, inline=False) -> None: workers = self.workers self_ref = weakref.ref(self) @@ -192,9 +236,13 @@ def monitor_workers(): _self.failure_callback = None callback() - Thread( - target=monitor_workers, daemon=True, name="MultiprocWorkerMonitor" - ).start() + if not inline: + Thread( + target=monitor_workers, daemon=True, name="MultiprocWorkerMonitor" + ).start() + return + + monitor_workers() def register_failure_callback(self, callback: FailureCallback): if self.is_failed: @@ -247,7 +295,9 @@ def collective_rpc( # type: ignore[override] ) -> Any | list[Any] | Future[Any | list[Any]]: """Returns single result if unique_reply_rank and/or kv_output_aggregator is provided, otherwise list.""" - + assert self.rpc_broadcast_mq is not None, ( + "collective_rpc should not be called on follower node" + ) if self.is_failed: raise RuntimeError("Executor failed.") @@ -269,20 +319,20 @@ def collective_rpc( # type: ignore[override] send_method = cloudpickle.dumps(method, protocol=pickle.HIGHEST_PROTOCOL) self.rpc_broadcast_mq.enqueue((send_method, args, kwargs, output_rank)) - workers = ( - (self.workers[output_rank],) if output_rank is not None else self.workers - ) + response_mqs: Sequence[MessageQueue] = self.response_mqs + if output_rank is not None: + response_mqs = (response_mqs[output_rank],) shutdown_event = self.shutdown_event def get_response(): responses = [] - for w in workers: + for mq in response_mqs: dequeue_timeout = ( None if deadline is None else (deadline - time.monotonic()) ) try: - status, result = w.worker_response_mq.dequeue( + status, result = mq.dequeue( timeout=dequeue_timeout, cancel=shutdown_event ) except TimeoutError as e: @@ -391,17 +441,26 @@ class UnreadyWorkerProcHandle: class WorkerProcHandle: proc: BaseProcess rank: int - worker_response_mq: MessageQueue # The worker process writes to this MQ + # The worker process writes to this MQ in single-node mode + worker_response_mq: MessageQueue | None + # This is only non empty on driver node, + # the peer worker process i writes to MQ + # `peer_worker_response_mqs[i]` + peer_worker_response_mqs: list[MessageQueue | None] death_writer: Connection | None = None @classmethod def from_unready_handle( - cls, unready_handle: UnreadyWorkerProcHandle, worker_response_mq: MessageQueue + cls, + unready_handle: UnreadyWorkerProcHandle, + worker_response_mq: MessageQueue | None, + peer_worker_response_mqs: list[MessageQueue | None], ) -> "WorkerProcHandle": return cls( proc=unready_handle.proc, rank=unready_handle.rank, worker_response_mq=worker_response_mq, + peer_worker_response_mqs=peer_worker_response_mqs, death_writer=unready_handle.death_writer, ) @@ -411,6 +470,38 @@ class WorkerProc: READY_STR = "READY" + def _init_message_queues( + self, input_shm_handle: Handle, vllm_config: VllmConfig + ) -> None: + if vllm_config.parallel_config.nnodes_within_dp == 1: + # Initialize MessageQueue for receiving SchedulerOutput + self.rpc_broadcast_mq = MessageQueue.create_from_handle( + input_shm_handle, self.worker.rank + ) + + # Initializes a message queue for sending the model output + self.worker_response_mq: MessageQueue = MessageQueue(1, 1) + self.peer_response_handles = [] + else: + # Initialize remote MessageQueue for receiving SchedulerOutput across nodes + 
self.rpc_broadcast_mq = get_inner_dp_world_group().create_mq_broadcaster( + external_writer_handle=input_shm_handle, + # Since there is external_writer_handle from executor proc, + # where the ready signal from actual writer is sent out of the + # create_mq_broadcaster method and after this setup, we make it + # non blocking. The handshake will be triggered when + # worker.rpc_broadcast_mq.wait_until_ready() is called + blocking=False, + ) + # Initializes remote message queue for sending the model output to the + # driver worker, exposing peer_response_handles for driver worker + # that include handles for all ranks + self.worker_response_mq, self.peer_response_handles = ( + get_inner_dp_world_group().create_single_reader_mq_broadcasters( + reader_rank_in_group=0 + ) + ) + def __init__( self, vllm_config: VllmConfig, @@ -421,13 +512,15 @@ def __init__( shared_worker_lock: LockType, ): self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + wrapper = WorkerWrapperBase( + vllm_config=vllm_config, rpc_rank=local_rank, global_rank=rank + ) # TODO: move `init_worker` to executor level as a collective rpc call all_kwargs: list[dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) ] is_driver_worker = rank % vllm_config.parallel_config.tensor_parallel_size == 0 - all_kwargs[rank] = { + all_kwargs[local_rank] = { "vllm_config": vllm_config, "local_rank": local_rank, "rank": rank, @@ -438,14 +531,6 @@ def __init__( wrapper.init_worker(all_kwargs) self.worker = wrapper - # Initialize MessageQueue for receiving SchedulerOutput - self.rpc_broadcast_mq = MessageQueue.create_from_handle( - input_shm_handle, self.worker.rank - ) - - # Initializes a message queue for sending the model output - self.worker_response_mq = MessageQueue(1, 1) - scheduler_config = vllm_config.scheduler_config self.use_async_scheduling = scheduler_config.async_scheduling if self.use_async_scheduling: @@ -466,6 +551,7 @@ def __init__( ) # Load model + self._init_message_queues(input_shm_handle, vllm_config) self.worker.load_model() # Enable environment variable cache (e.g. assume no more @@ -512,6 +598,27 @@ def make_worker_process( # death_reader in child will get EOFError return UnreadyWorkerProcHandle(proc, rank, reader, death_writer) + @staticmethod + def wait_for_response_handle_ready( + handles: dict[str, Any], proc_handle: UnreadyWorkerProcHandle + ) -> WorkerProcHandle: + response_handle = handles["handle"] + worker_response_mq: MessageQueue | None = None + if len(response_handle.local_reader_ranks) > 0: + worker_response_mq = MessageQueue.create_from_handle(response_handle, 0) + peer_response_handles = handles["peer_response_handles"] + peer_worker_response_mqs = [ + MessageQueue.create_from_handle(handle, -1) + if handle.remote_subscribe_addr is not None + else None + for handle in peer_response_handles + ] + return WorkerProcHandle.from_unready_handle( + proc_handle, + worker_response_mq, + peer_worker_response_mqs=peer_worker_response_mqs, + ) + @staticmethod def wait_for_ready( unready_proc_handles: list[UnreadyWorkerProcHandle], @@ -537,16 +644,10 @@ def wait_for_ready( if response["status"] != "READY": raise e - # Extract the message queue handle. 
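A rough sketch of the rank bookkeeping used by the executor changes above: each node inside a DP rank owns `local_world_size` workers, and only the leader node reads responses for every global rank (local queues for its own workers, peer queues for remote ones). The function names and string placeholders below are illustrative, not part of the patch.

```python
# Simplified sketch: local-to-global rank mapping and which response queues the
# leader node (node_rank_within_dp == 0) ends up reading.
def global_ranks_for_node(local_world_size: int, node_rank_within_dp: int) -> list[int]:
    start = local_world_size * node_rank_within_dp
    return list(range(start, start + local_world_size))


def leader_response_sources(world_size: int, local_world_size: int) -> list[str]:
    sources = []
    for rank in range(world_size):
        if rank < local_world_size:
            sources.append(f"local worker_response_mq of rank {rank}")
        else:
            sources.append(f"peer_worker_response_mqs[{rank}] (remote node)")
    return sources


# Example: world_size=8 split across 2 nodes with 4 workers each.
assert global_ranks_for_node(local_world_size=4, node_rank_within_dp=1) == [4, 5, 6, 7]
print(leader_response_sources(world_size=8, local_world_size=4))
```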
- worker_response_mq = MessageQueue.create_from_handle( - response["handle"], 0 - ) - ready_proc_handles[unready_proc_handle.rank] = ( - WorkerProcHandle.from_unready_handle( - unready_proc_handle, worker_response_mq - ) + idx = unready_proc_handle.rank % len(ready_proc_handles) + ready_proc_handles[idx] = WorkerProc.wait_for_response_handle_ready( + response, unready_proc_handle ) - except EOFError: e.__suppress_context__ = True raise e from None @@ -618,12 +719,14 @@ def monitor_parent_death(): { "status": WorkerProc.READY_STR, "handle": worker.worker_response_mq.export_handle(), + "peer_response_handles": worker.peer_response_handles, } ) # Ensure message queues are ready. Will deadlock if re-ordered. # Must be kept consistent with the Executor - worker.rpc_broadcast_mq.wait_until_ready() + if worker.rpc_broadcast_mq is not None: + worker.rpc_broadcast_mq.wait_until_ready() worker.worker_response_mq.wait_until_ready() ready_writer.close() ready_writer = None diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 283e3744bcf6..42a844d96558 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -189,6 +189,7 @@ def init_device(self): and self.parallel_config.distributed_executor_backend not in ["ray", "external_launcher"] and self.vllm_config.parallel_config.data_parallel_backend != "ray" + and self.vllm_config.parallel_config.nnodes_within_dp == 1 ): # Use local DP rank if available, otherwise use global DP rank. dp_local_rank = self.parallel_config.data_parallel_rank_local @@ -205,7 +206,14 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must be " + f"less than or equal to the number of visible devices " + f"({visible_device_count})." + ) self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 3991c16eefba..16f321c08077 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -180,6 +180,7 @@ def __init__( self, vllm_config: VllmConfig, rpc_rank: int = 0, + global_rank: int | None = None, ) -> None: """ Initialize the worker wrapper with the given vllm_config and rpc_rank. @@ -192,6 +193,7 @@ def __init__( group. 
""" self.rpc_rank = rpc_rank + self.global_rank = self.rpc_rank if global_rank is None else global_rank self.worker: WorkerBase | None = None # do not store this `vllm_config`, `init_worker` will set the final @@ -312,7 +314,7 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: assert self.worker is not None def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: - kv_cache_config = kv_cache_configs[self.rpc_rank] + kv_cache_config = kv_cache_configs[self.global_rank] with set_current_vllm_config(self.vllm_config): self.worker.initialize_from_config(kv_cache_config) # type: ignore From af02c409702f2f41eb13471ce3224e3315e19d89 Mon Sep 17 00:00:00 2001 From: Dezhan Date: Sun, 16 Nov 2025 01:46:29 -0800 Subject: [PATCH 111/578] Fixed gpt-oss _load_weights_other() parameter position bug (#28715) Co-authored-by: Dezhan Tu --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 692ef605fe17..328c8c0ac4b7 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -641,8 +641,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) else: return self._load_weights_other( - ep_rank_end, ep_rank_start, + ep_rank_end, heads_per_rank, head_start, weights, From 3bc11757984ce256905d1b8517d50b514af8b175 Mon Sep 17 00:00:00 2001 From: scottzh8 Date: Sun, 16 Nov 2025 02:20:57 -0800 Subject: [PATCH 112/578] [Bugfix] Fix host and port join for ipv6 in bench serve (#28679) Signed-off-by: Scott Zhang Co-authored-by: Scott Zhang --- vllm/benchmarks/serve.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 0e9b0fbe2c02..dddb050ec180 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -49,6 +49,7 @@ from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils.gc_utils import freeze_gc_heap +from vllm.utils.network_utils import join_host_port MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -1333,8 +1334,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: api_url = f"{args.base_url}{args.endpoint}" base_url = f"{args.base_url}" else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" + host_port = join_host_port(args.host, args.port) + api_url = f"http://{host_port}{args.endpoint}" + base_url = f"http://{host_port}" # Headers headers = None From 8d259fad6cd5a93bef04d00640e132e84c0c9b20 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sun, 16 Nov 2025 05:12:45 -0800 Subject: [PATCH 113/578] Fix gpt oss weight loading with EP + bf16 (#28765) Signed-off-by: ashors1 --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 328c8c0ac4b7..7df3b087ccb8 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -494,8 +494,8 @@ def _load_weights_mxfp4( def _load_weights_other( self, - ep_rank_start: int, ep_rank_end: int, + ep_rank_start: int, heads_per_rank: int, head_start: int, weights: Iterable[tuple[str, torch.Tensor]], From 63fed5550609b96b578d2512aefced09efe76e1e Mon Sep 17 00:00:00 2001 From: Didier Durand 
<2927957+didier-durand@users.noreply.github.com> Date: Sun, 16 Nov 2025 15:30:06 +0100 Subject: [PATCH 114/578] [Doc]: fix typos in various files (#28811) Signed-off-by: Didier Durand --- docs/contributing/benchmarks.md | 2 +- docs/design/cuda_graphs.md | 2 +- docs/features/custom_arguments.md | 2 +- docs/features/custom_logitsprocs.md | 8 ++++---- docs/getting_started/installation/cpu.md | 2 +- docs/getting_started/installation/cpu.s390x.inc.md | 2 +- docs/getting_started/installation/cpu.x86.inc.md | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index ec0dfc4199d1..c9bc9cfe28a3 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -983,7 +983,7 @@ each document has close to 512 tokens. Please note that the `/v1/rerank` is also supported by embedding models. So if you're running with an embedding model, also set `--no_reranker`. Because in this case the query is -treated as a individual prompt by the server, here we send `random_batch_size - 1` documents +treated as an individual prompt by the server, here we send `random_batch_size - 1` documents to account for the extra prompt which is the query. The token accounting to report the throughput numbers correctly is also adjusted. diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index aac7b76eea26..66bf3b27d1f5 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -128,7 +128,7 @@ A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wrap 3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create a new entry and cache it) or replay (if key exists in the cache). -The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and cenralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106). +The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106). #### Nested Wrapper design diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md index 7a650d0e79c2..728a2c89901d 100644 --- a/docs/features/custom_arguments.md +++ b/docs/features/custom_arguments.md @@ -5,7 +5,7 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code. !!! note - Make sure your custom logits processor have implemented `validate_params` for custom arguments. 
Otherwise invalid custom arguments can cause unexpected behaviour. + Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour. ## Offline Custom Arguments diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 52fcc44efacc..5ddef9db1611 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -71,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous - * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots + * **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots 5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch @@ -286,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh ## Ways to Load Your Custom Logits Processor in vLLM -Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests. +Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests. This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor. @@ -438,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`) ## Best Practices for Writing Custom Logits Processors -Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. +Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently. 
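To make the efficiency point concrete, here is a small hypothetical logits processor whose `apply()` stays fully vectorized over the persistent batch; the class name and state layout are invented for illustration and are not the exact vLLM interface.

```python
# Hypothetical example -- illustrative state layout, not the verbatim vLLM API.
# Keeping per-request state in a tensor lets apply() be one batched add instead
# of a Python loop over requests.
import torch


class BannedTokenBias:
    def __init__(self, max_batch: int, vocab_size: int):
        # One row per persistent-batch slot; rows are rewritten in update_state().
        self.bias = torch.zeros(max_batch, vocab_size)

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        # logits: [num_requests, vocab_size]; a single fused add covers the batch.
        return logits + self.bias[: logits.shape[0]]


proc = BannedTokenBias(max_batch=4, vocab_size=8)
proc.bias[0, 3] = float("-inf")  # ban token 3 for the request in slot 0
out = proc.apply(torch.zeros(2, 8))
assert out[0, 3] == float("-inf") and out[1, 3] == 0
```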
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` @@ -465,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default -* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index e8bfca0e5e88..be99cef3723e 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels. ### Which `dtype` should be used? -- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem. +- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem. ### How to launch a vLLM service on CPU? diff --git a/docs/getting_started/installation/cpu.s390x.inc.md b/docs/getting_started/installation/cpu.s390x.inc.md index 442c2b4ec64e..c2163139a7c5 100644 --- a/docs/getting_started/installation/cpu.s390x.inc.md +++ b/docs/getting_started/installation/cpu.s390x.inc.md @@ -2,7 +2,7 @@ vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. -Currently the CPU implementation for s390x architecture supports FP32 datatype only. +Currently, the CPU implementation for s390x architecture supports FP32 datatype only. !!! warning There are no pre-built wheels or images for this device, so you must build vLLM from source. diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md index 00f3b726b1a0..310f179cb89c 100644 --- a/docs/getting_started/installation/cpu.x86.inc.md +++ b/docs/getting_started/installation/cpu.x86.inc.md @@ -83,7 +83,7 @@ uv pip install dist/*.whl !!! example "Troubleshooting" - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`. - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed. - - `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU. 
+ - `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU. - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency. ```toml title="pyproject.toml" [build-system] From ac1daf32337d312e7a575901da2e19857f4c0be1 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 17 Nov 2025 01:03:21 +0800 Subject: [PATCH 115/578] fix comment typo (#28802) Signed-off-by: Andy Xie --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 7987e5fb83fd..6bf05803e14e 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -423,7 +423,7 @@ def get_vllm_port() -> int | None: raise ValueError(f"VLLM_PORT '{port}' must be a valid integer") from err -# The begin-* and end* here are used by the documentation generator +# The start-* and end* here are used by the documentation generator # to extract the used env vars. # --8<-- [start:env-vars-definition] From 5a87076d6ee60a2cf681dada9e971b4ee3e6063e Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sun, 16 Nov 2025 17:37:15 +0000 Subject: [PATCH 116/578] [Model][QwenVL] Optimize `Qwen2_5_VisionAttention` q,k preparation (#28769) Signed-off-by: Lukas Geiger Co-authored-by: Isotr0py --- vllm/model_executor/models/dots_ocr.py | 4 +- vllm/model_executor/models/qwen2_5_vl.py | 48 ++++++++++++------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 405af8f8be42..f46caaa095c6 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -39,8 +39,8 @@ ) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM -from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention from vllm.model_executor.models.qwen2_vl import ( + Qwen2VisionAttention, Qwen2VLDummyInputsBuilder, Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, @@ -328,7 +328,7 @@ def forward( # [S, C] -> [S, B=1, C] x = hidden_states.unsqueeze(1) x, _ = self.qkv(x) - q, k, v = Qwen2_5_VisionAttention.split_qkv(self, x) + q, k, v = Qwen2VisionAttention.split_qkv(self, x) bs = q.shape[1] # [S,B,H,D] -> [B,S,H,D] q = q.permute(1, 0, 2, 3).contiguous() diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 7617929e93ac..897dd7ef29f1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -359,23 +359,6 @@ def __init__( AttentionBackendEnum.ROCM_AITER_FA, } - def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: - # [s, b, 3 * head * head_dim] - seq_len, bs, _ = qkv.shape - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] - q, k, v = qkv.chunk(3, dim=2) - - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = ( - seq_len, - bs, - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ) - q, k, v = (x.view(*new_shape) for x in (q, k, v)) - return q, k, v - def forward( self, x: torch.Tensor, @@ -386,17 +369,32 @@ def forward( ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) + seq_len, batch_size, _ = x.shape - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, 
head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] + qkv = einops.rearrange( + x, + "s b (three head head_dim) -> b s three head head_dim", + three=3, + head=self.num_attention_heads_per_partition, + ) - q, k, v = (einops.rearrange(x, "s b ... -> b s ...") for x in (q, k, v)) if rotary_pos_emb is not None: - # [2 * b, s, heads, head_dim] - qk_concat = torch.cat([q, k], dim=0) - qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) - q, k = torch.chunk(qk_rotated, 2, dim=0) + qk, v = qkv[:, :, :2], qkv[:, :, 2] + + qk_reshaped = einops.rearrange( + qk, "b s two head head_dim -> (two b) s head head_dim", two=2 + ) + qk_rotated = apply_rotary_pos_emb_vision(qk_reshaped, rotary_pos_emb) + qk_rotated = qk_rotated.view( + 2, + batch_size, + seq_len, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + q, k = qk_rotated.unbind(dim=0) + else: + q, k, v = qkv.unbind(dim=2) if self.is_flash_attn_backend: context_layer = vit_flash_attn_wrapper( From 03ee48111de7372a1231872f26262e7c46ab1c83 Mon Sep 17 00:00:00 2001 From: amirkl94 <203507526+amirkl94@users.noreply.github.com> Date: Sun, 16 Nov 2025 20:39:44 +0200 Subject: [PATCH 117/578] Feature: Support Relu2 in FusedMoE fp8 cutlass path (#27261) --- tests/kernels/moe/test_flashinfer.py | 18 +++++++--- .../fused_moe/flashinfer_cutlass_moe.py | 11 +++++-- .../layers/quantization/modelopt.py | 33 +++++++++++-------- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 3a681d4603f8..218df4a2632c 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -77,10 +77,14 @@ class TestData: @staticmethod def make_moe_tensors_8bit( - m: int, k: int, n: int, e: int, reorder: bool + m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu" ) -> "TestData": + is_gated = activation != "relu2_no_mul" + hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 - w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) + w13 = torch.randn( + (e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16 + ) w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) # Scale to fp8 @@ -190,18 +194,22 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"]) def test_flashinfer_cutlass_moe_fp8_no_graph( m: int, n: int, k: int, e: int, topk: int, + activation: str, monkeypatch, ): current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): - td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False) + td = TestData.make_moe_tensors_8bit( + m, k, n, e, reorder=False, activation=activation + ) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) topk_weights, topk_ids, _ = FusedMoE.select_experts( @@ -233,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - activation="silu", + activation=activation, global_num_experts=e, expert_map=None, apply_router_weight_on_input=True, @@ -253,7 +261,7 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig: td.layer, topk_weights, topk_ids, - activation="silu", + activation=activation, global_num_experts=e, 
expert_map=None, apply_router_weight_on_input=True, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 943695f921ad..f864634c6617 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -148,8 +148,14 @@ def apply( expert_tokens_meta: mk.ExpertTokensMetadata | None, apply_router_weight_on_input: bool | None, ): - assert activation == "silu", ( - "Only activation silu is supported in FlashInferExperts" + from flashinfer.fused_moe.core import ActivationType + + activation_str_to_value_map = { + "silu": ActivationType.Swiglu, # This is the default + "relu2_no_mul": ActivationType.Relu2, + } + assert activation in activation_str_to_value_map, ( + f"{activation=} missing from {activation_str_to_value_map.keys()=}" ) # Select quantization metadata based on FP8 format/path @@ -215,6 +221,7 @@ def apply( ep_size=self.ep_size, ep_rank=self.ep_rank, output=output, + activation_type=activation_str_to_value_map[activation], # Informs FlashInfer to use the block-scale decoding path when True use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e14753c60c48..cf6325eb85df 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -354,12 +354,18 @@ def __init__( self.cutlass_fp8_supported = cutlass_fp8_supported() self.flashinfer_moe_backend: FlashinferMoeBackend | None = None - if ( - envs.VLLM_USE_FLASHINFER_MOE_FP8 - and has_flashinfer_moe() - and self.moe.is_act_and_mul - ): + if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe(): self.flashinfer_moe_backend = get_flashinfer_moe_backend() + if ( + self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + and not self.moe.is_act_and_mul + ): + logger.info_once( + "Non-gated MoE is not supported for min-latency mode," + "falling back to high-throughput mode" + ) + self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + logger.info_once( f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" ) @@ -557,10 +563,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) if self.flashinfer_moe_backend is not None: - layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) - register_moe_scaling_factors(layer) + if self.moe.is_act_and_mul: + layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) + register_moe_scaling_factors(layer) def get_fused_moe_quant_config( self, layer: torch.nn.Module @@ -570,13 +577,13 @@ def get_fused_moe_quant_config( return fp8_w8a8_moe_quant_config( w1_scale=layer.w13_weight_scale, - g1_alphas=(layer.w13_weight_scale * layer.w13_input_scale).squeeze(), + g1_alphas=layer.output1_scales_gate_scalar.squeeze(), w2_scale=layer.w2_weight_scale, - g2_alphas=(layer.w2_weight_scale * layer.w2_input_scale).squeeze(), + g2_alphas=layer.output2_scales_scalar.squeeze(), a1_scale=layer.w13_input_scale, a1_gscale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, - a2_gscale=1.0 / layer.w2_input_scale, + a2_gscale=layer.w2_input_scale_inv, per_act_token_quant=False, ) @@ -642,9 +649,9 @@ def apply( ) if self.flashinfer_moe_backend == 
FlashinferMoeBackend.CUTLASS: - assert not renormalize - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + assert activation in ("silu", "relu2_no_mul"), ( + "Expected activation to be in ('silu', 'relu2_no_mul')," + f"but got {activation}" ) return flashinfer_cutlass_moe_fp8( x, From 80b6080ddcad0653daa6b776eb71a5a7029b70d8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 16 Nov 2025 14:46:46 -0800 Subject: [PATCH 118/578] [BugFix] Fix async scheduling + chunked prefill + preemption (#28787) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 10 ++++------ vllm/v1/core/sched/scheduler.py | 4 +--- vllm/v1/utils.py | 3 +++ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index dbe403ece051..c4aca82416cd 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -65,9 +65,8 @@ def test_without_spec_decoding( (True, "mp", True, None, False), (True, "uni", True, None, False), (False, "mp", True, None, True), - # Async scheduling + preemption + chunked prefill needs to be fixed (WIP) - # (True, "mp", True, None, True), - # (True, "uni", True, None, True), + (True, "mp", True, None, True), + (True, "uni", True, None, True), ] run_tests( @@ -103,9 +102,8 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (False, "mp", True, spec_config_short, True), (True, "uni", True, spec_config, False), (True, "uni", True, spec_config_short, False), - # Async scheduling + preemption + chunked prefill needs to be fixed (WIP) - # (True, "mp", True, spec_config, True), - # (True, "uni", True, spec_config_short, True), + (True, "mp", True, spec_config, True), + (True, "uni", True, spec_config_short, True), ] run_tests( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index bc15979dea62..8e62542337a7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -778,9 +778,7 @@ def _make_cached_request_data( assert not scheduled_in_prev_step resumed_req_ids.add(req_id) if not scheduled_in_prev_step: - all_token_ids[req_id] = req.all_token_ids[ - : req.num_computed_tokens + num_tokens - ] + all_token_ids[req_id] = req.all_token_ids.copy() new_block_ids.append( req_to_new_blocks[req_id].get_block_ids(allow_none=True) ) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index a401f6d74cdd..29099d1e9b17 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -97,6 +97,9 @@ def __len__(self): def __repr__(self): return f"ConstantList({self._x})" + def copy(self) -> list[T]: + return self._x.copy() + class CpuGpuBuffer: """Buffer to easily copy tensors between CPU and GPU.""" From 561253b37faadaafe68168ea32d8d8157621a6b4 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Sun, 16 Nov 2025 18:02:42 -0800 Subject: [PATCH 119/578] [Performance][Fix] update nvfp4 code to support renorm routing (#28569) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Michael Goin --- .../layers/quantization/modelopt.py | 18 +++++++++++------- .../quantization/utils/flashinfer_utils.py | 5 ++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index cf6325eb85df..476521813f46 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -15,6 
+15,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, + RoutingMethodType, fp8_w8a8_moe_quant_config, nvfp4_moe_quant_config, ) @@ -1657,16 +1658,19 @@ def apply( use_llama4_routing = ( custom_routing_function is Llama4MoE.custom_routing_function ) - routing_method_type = flashinfer.RoutingMethodType.DeepSeekV3 + routing_method_type = layer.routing_method_type if use_llama4_routing: - routing_method_type = flashinfer.RoutingMethodType.Llama4 + routing_method_type = RoutingMethodType.Llama4 + router_logits = ( + router_logits.to(torch.float32) + if routing_method_type == RoutingMethodType.DeepSeekV3 + else router_logits + ) routing_bias = e_score_correction_bias if routing_bias is not None: routing_bias = routing_bias.to(torch.bfloat16) out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( - routing_logits=router_logits - if use_llama4_routing - else router_logits.to(torch.float32), + routing_logits=router_logits, routing_bias=routing_bias, hidden_states=hidden_states_fp4, hidden_states_scale=hidden_states_scale_linear_fp4.view( @@ -1690,8 +1694,8 @@ def apply( output2_scale_scalar=layer.g2_alphas.data, num_experts=global_num_experts, top_k=top_k, - n_group=num_expert_group if num_expert_group is not None else 0, - topk_group=topk_group if topk_group is not None else 0, + n_group=num_expert_group, + topk_group=topk_group, intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index d9e9b4240271..f22e17945d1f 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -291,5 +291,8 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool: # TODO(shuw@nvidia): Update when new backends are added. 
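For the activation plumbing in this commit, the reference-math sketch below contrasts the gated SwiGLU path (where `w13` stacks gate and up projections, hence `2 * n` rows) with the non-gated `relu2_no_mul` path (`n` rows); it is illustrative only, not the fused CUTLASS kernel.

```python
# Reference math only -- shows why the relu2_no_mul path needs an [n, k] w13
# while the gated silu path needs [2 * n, k]; the real path runs quantized
# fused kernels instead of this dense arithmetic.
import torch
import torch.nn.functional as F


def moe_ffn_reference(x, w13, w2, activation: str):
    h = x @ w13.t()                     # up (and gate, if gated) projection
    if activation == "silu":            # SwiGLU: split into gate / up halves
        gate, up = h.chunk(2, dim=-1)
        h = F.silu(gate) * up
    elif activation == "relu2_no_mul":  # squared ReLU, no gating multiply
        h = F.relu(h) ** 2
    else:
        raise ValueError(activation)
    return h @ w2.t()                   # down projection


x = torch.randn(2, 16)                  # k = 16, n = 8
out_gated = moe_ffn_reference(x, torch.randn(2 * 8, 16), torch.randn(16, 8), "silu")
out_relu2 = moe_ffn_reference(x, torch.randn(8, 16), torch.randn(16, 8), "relu2_no_mul")
assert out_gated.shape == out_relu2.shape == (2, 16)
```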
- backends_supporting_global_sf = (FlashinferMoeBackend.CUTLASS,) + backends_supporting_global_sf = ( + FlashinferMoeBackend.CUTLASS, + FlashinferMoeBackend.TENSORRT_LLM, + ) return backend in backends_supporting_global_sf From d64429bb369d4087f9f91609e7275c4901d65aea Mon Sep 17 00:00:00 2001 From: liuzhenwei Date: Mon, 17 Nov 2025 11:01:33 +0800 Subject: [PATCH 120/578] [NIXL][XPU] update install script of NIXL (#28778) Signed-off-by: zhenwei-intel --- docker/Dockerfile.xpu | 3 ++- tools/install_nixl_from_source_ubuntu.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 4e6ef8f5ca13..5d5b82c4fa5a 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \ libxext6 \ libgl1 \ lsb-release \ + libaio-dev \ numactl \ wget \ vim \ @@ -68,8 +69,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN python3 -m pip install -e tests/vllm_test_utils # install nixl from source code +ENV NIXL_VERSION=0.7.0 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/" RUN --mount=type=cache,target=/root/.cache/pip \ pip uninstall oneccl oneccl-devel -y diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index 4a20b6b7bb8f..a786abba95ad 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -175,6 +175,7 @@ def build_and_install_prerequisites(args): build_env["LD_LIBRARY_PATH"] = ( f"{ucx_lib_path}:{ucx_plugin_path}:{existing_ld_path}".strip(":") ) + build_env["LDFLAGS"] = "-Wl,-rpath,$ORIGIN" print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}", flush=True) temp_wheel_dir = os.path.join(ROOT_DIR, "temp_wheelhouse") From 60e089f0b90b1fe9b65224b069c953927d1f3b44 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Mon, 17 Nov 2025 12:52:11 +0800 Subject: [PATCH 121/578] [ROCm][Qwen3-32B] Fix AITER MHA accuracy issue cause by #25763 (#28670) Signed-off-by: Xiake Sun --- vllm/v1/attention/backends/rocm_aiter_fa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index ad454daa582e..ea611848b0e8 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -729,7 +729,7 @@ def forward( cu_seqlens_k=attn_metadata.prefill_metadata.query_start_loc, max_seqlen_q=attn_metadata.prefill_metadata.max_query_len, max_seqlen_k=attn_metadata.prefill_metadata.max_seq_len, - min_seqlen_q=attn_metadata.prefill_metadata.min_query_len, + min_seqlen_q=1, dropout_p=0.0, softmax_scale=self.scale, causal=True, @@ -759,7 +759,7 @@ def forward( cu_seqlens_q=attn_metadata.extend_metadata.query_start_loc, max_seqlen_q=attn_metadata.extend_metadata.max_query_len, max_seqlen_k=attn_metadata.extend_metadata.max_seq_len, - min_seqlen_q=attn_metadata.extend_metadata.min_query_len, + min_seqlen_q=1, block_table=attn_metadata.block_table[ num_decodes : num_decodes + num_extends ], From 6f374192442381b37a6a6ba29045c74a8ee2486d Mon Sep 17 00:00:00 2001 From: Jay Caldwell <111952840+jscaldwell55@users.noreply.github.com> Date: Sun, 16 Nov 2025 23:54:46 -0600 Subject: [PATCH 122/578] [Bugfix][Model] Prevent special token leakage in KimiK2ToolParser streaming mode (#28543) Signed-off-by: Jscaldwell55 --- 
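The tests added below exercise a small state machine around the tool-call section markers. As a stripped-down illustration of the suppression behaviour being verified (the marker strings are the real ones; the class is not the actual parser, which additionally buffers split markers and parses the tool-call payload):

```python
SECTION_BEGIN = "<|tool_calls_section_begin|>"
SECTION_END = "<|tool_calls_section_end|>"


class SectionFilter:
    """Toy filter: pass reasoning text through, suppress text inside a tool section."""

    def __init__(self) -> None:
        self.in_tool_section = False

    def visible_text(self, delta: str) -> str:
        out = []
        while delta:
            marker = SECTION_END if self.in_tool_section else SECTION_BEGIN
            before, sep, delta = delta.partition(marker)
            if not self.in_tool_section:
                out.append(before)  # reasoning text is surfaced to the client
            # text between the markers is suppressed, never leaked as content
            if sep:
                self.in_tool_section = not self.in_tool_section
        return "".join(out)


f = SectionFilter()
assert f.visible_text("hi " + SECTION_BEGIN + " noise " + SECTION_END + " bye") == "hi  bye"
assert not f.in_tool_section  # begin and end arriving in the same delta still round-trips
```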
tests/tool_use/test_kimi_k2_tool_parser.py | 593 ++++++++++++++++++ .../tool_parsers/kimi_k2_tool_parser.py | 202 +++++- 2 files changed, 790 insertions(+), 5 deletions(-) diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py index c358589dbc29..33dabbc7e7b9 100644 --- a/tests/tool_use/test_kimi_k2_tool_parser.py +++ b/tests/tool_use/test_kimi_k2_tool_parser.py @@ -209,3 +209,596 @@ def test_streaming_no_tool_calls(kimi_k2_tool_parser): assert result is not None assert hasattr(result, "content") assert result.content == " without any tool calls." + + +def test_token_leak_between_section_and_tool_begin(kimi_k2_tool_parser): + """ + Test that text between <|tool_calls_section_begin|> and <|tool_call_begin|> + is suppressed and does not leak into reasoning_delta. + This is the main vulnerability being fixed. + """ + kimi_k2_tool_parser.reset_streaming_state() + + # Get token IDs for the markers + section_begin_token_id = kimi_k2_tool_parser.vocab.get( + "<|tool_calls_section_begin|>" + ) + tool_call_begin_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>") + + # Simulate streaming sequence: + # Delta 1: "I'll help you with that. " + result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="I'll help you with that. ", + delta_text="I'll help you with that. ", + previous_token_ids=[], + current_token_ids=[1, 2, 3], # Regular tokens + delta_token_ids=[1, 2, 3], + request=None, + ) + assert result1 is not None + assert result1.content == "I'll help you with that. " + + # Delta 2: "<|tool_calls_section_begin|>" + prev_ids = [1, 2, 3] + curr_ids = prev_ids + [section_begin_token_id] + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="I'll help you with that. ", + current_text="I'll help you with that. <|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=prev_ids, + current_token_ids=curr_ids, + delta_token_ids=[section_begin_token_id], + request=None, + ) + # Section marker should be stripped and suppressed + assert result2 is None or (result2.content is None or result2.content == "") + + # Delta 3: " spurious text or tokens " (THE LEAK SCENARIO) + prev_ids = curr_ids + curr_ids = curr_ids + [4, 5] + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="I'll help you with that. <|tool_calls_section_begin|>", + current_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ", + delta_text=" spurious text ", + previous_token_ids=prev_ids, + current_token_ids=curr_ids, + delta_token_ids=[4, 5], + request=None, + ) + # CRITICAL: This text should be suppressed, NOT returned as reasoning_delta + assert result3 is None or (result3.content is None or result3.content == "") + + # Delta 4: "<|tool_call_begin|>..." + prev_ids = curr_ids + curr_ids = curr_ids + [tool_call_begin_token_id] + _result4 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ", + current_text="I'll help you with that. 
<|tool_calls_section_begin|> spurious text <|tool_call_begin|>", + delta_text="<|tool_call_begin|>", + previous_token_ids=prev_ids, + current_token_ids=curr_ids, + delta_token_ids=[tool_call_begin_token_id], + request=None, + ) + # Now we're in tool call mode, result depends on internal state + # The key is that the spurious text from Delta 3 was not leaked + + +def test_split_markers_across_deltas(kimi_k2_tool_parser): + """ + Test that markers split across delta chunks are correctly detected + via the rolling buffer mechanism. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_token_id = kimi_k2_tool_parser.vocab.get( + "<|tool_calls_section_begin|>" + ) + + # Delta 1: "...reasoning<|tool_calls_sec" + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning", + current_text="Some reasoning<|tool_calls_sec", + delta_text="<|tool_calls_sec", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, 3], # Partial token + delta_token_ids=[3], + request=None, + ) + # Partial token not recognized yet, might be buffered + # Should return as content or None (depends on implementation) + + # Delta 2: "tion_begin|> " (completes the marker) + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning<|tool_calls_sec", + current_text="Some reasoning<|tool_calls_section_begin|> ", + delta_text="tion_begin|> ", + previous_token_ids=[1, 2, 3], + current_token_ids=[1, 2, section_begin_token_id, 4], + delta_token_ids=[section_begin_token_id, 4], + request=None, + ) + # Now the complete marker should be detected via buffer + # The parser should enter tool section mode + assert kimi_k2_tool_parser.in_tool_section is True + + +def test_marker_variants(kimi_k2_tool_parser): + """Test that both singular and plural marker variants are recognized.""" + kimi_k2_tool_parser.reset_streaming_state() + + # Test singular variant: <|tool_call_section_begin|> (note: singular "call") + singular_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_section_begin|>") + + if singular_token_id is not None: # Only test if tokenizer supports it + _result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning <|tool_call_section_begin|>", + delta_text="<|tool_call_section_begin|>", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, singular_token_id], + delta_token_ids=[singular_token_id], + request=None, + ) + # Should enter tool section mode with singular variant too + assert kimi_k2_tool_parser.in_tool_section is True + + +def test_reentry_to_reasoning_after_tool_section(kimi_k2_tool_parser): + """ + Test that after exiting a tool section with <|tool_calls_section_end|>, + subsequent text is correctly returned as reasoning content. 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Enter tool section + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="<|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[], + current_token_ids=[section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # Exit tool section + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|>", + current_text="<|tool_calls_section_begin|><|tool_calls_section_end|>", + delta_text="<|tool_calls_section_end|>", + previous_token_ids=[section_begin_id], + current_token_ids=[section_begin_id, section_end_id], + delta_token_ids=[section_end_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is False + + # Subsequent reasoning text should be returned normally + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|><|tool_calls_section_end|>", + current_text="<|tool_calls_section_begin|><|tool_calls_section_end|> More reasoning", + delta_text=" More reasoning", + previous_token_ids=[section_begin_id, section_end_id], + current_token_ids=[section_begin_id, section_end_id, 10, 11], + delta_token_ids=[10, 11], + request=None, + ) + assert result3 is not None + assert result3.content == " More reasoning" + + +def test_empty_tool_section(kimi_k2_tool_parser): + """Test an empty tool section (begin immediately followed by end).""" + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Section begin + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning <|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[1], + current_token_ids=[1, section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + + # Immediate section end + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning <|tool_calls_section_begin|>", + current_text="Reasoning <|tool_calls_section_begin|><|tool_calls_section_end|>", + delta_text="<|tool_calls_section_end|>", + previous_token_ids=[1, section_begin_id], + current_token_ids=[1, section_begin_id, section_end_id], + delta_token_ids=[section_end_id], + request=None, + ) + # Should exit cleanly without errors + assert kimi_k2_tool_parser.in_tool_section is False + + +def test_malformed_tool_section_recovery(kimi_k2_tool_parser): + """ + Test that the parser recovers from a malformed tool section + that never closes properly. 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + + # Enter tool section + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="<|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[], + current_token_ids=[section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # Simulate a lot of text without proper tool calls or section end + # This should trigger the error recovery mechanism + large_text = "x" * 10000 # Exceeds max_section_chars + + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|>", + current_text="<|tool_calls_section_begin|>" + large_text, + delta_text=large_text, + previous_token_ids=[section_begin_id], + current_token_ids=[section_begin_id] + list(range(100, 100 + len(large_text))), + delta_token_ids=list(range(100, 100 + len(large_text))), + request=None, + ) + + # Parser should have force-exited the tool section + assert kimi_k2_tool_parser.in_tool_section is False + # And returned the content as reasoning + assert result2 is not None + assert result2.content == large_text + + +def test_state_reset(kimi_k2_tool_parser): + """Test that reset_streaming_state() properly clears all state.""" + # Put parser in a complex state + kimi_k2_tool_parser.in_tool_section = True + kimi_k2_tool_parser.token_buffer = "some buffer" + kimi_k2_tool_parser.current_tool_id = 5 + kimi_k2_tool_parser.prev_tool_call_arr = [{"id": "test"}] + kimi_k2_tool_parser.section_char_count = 1000 + + # Reset + kimi_k2_tool_parser.reset_streaming_state() + + # Verify all state is cleared + assert kimi_k2_tool_parser.in_tool_section is False + assert kimi_k2_tool_parser.token_buffer == "" + assert kimi_k2_tool_parser.current_tool_id == -1 + assert kimi_k2_tool_parser.prev_tool_call_arr == [] + assert kimi_k2_tool_parser.section_char_count == 0 + assert kimi_k2_tool_parser.current_tool_name_sent is False + assert kimi_k2_tool_parser.streamed_args_for_tool == [] + + +def test_section_begin_noise_tool_begin_same_chunk(kimi_k2_tool_parser): + """ + Test that begin→noise→tool_begin within the SAME chunk suppresses + the noise text correctly (not just across chunks). 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + tool_call_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>") + + # Single delta containing: section_begin + spurious text + tool_call_begin + combined_text = "<|tool_calls_section_begin|> noise text <|tool_call_begin|>" + + result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning " + combined_text, + delta_text=combined_text, + previous_token_ids=[1, 2], + current_token_ids=[1, 2, section_begin_id, 3, 4, tool_call_begin_id], + delta_token_ids=[section_begin_id, 3, 4, tool_call_begin_id], + request=None, + ) + + # The noise text should NOT leak into content + # Result should either be None/empty or start tool call parsing + if result is not None and result.content is not None: + # If content is returned, it should not contain the noise + assert "noise text" not in result.content + assert result.content == "" or result.content.strip() == "" + + +def test_stream_ends_without_section_end_marker(kimi_k2_tool_parser): + """ + Test that if the stream ends (EOF) without a proper section end marker, + the parser doesn't leak text, doesn't crash, and resets state cleanly. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + + # Enter tool section + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="<|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[], + current_token_ids=[section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # Some content in tool section + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|>", + current_text="<|tool_calls_section_begin|> partial content", + delta_text=" partial content", + previous_token_ids=[section_begin_id], + current_token_ids=[section_begin_id, 10, 11], + delta_token_ids=[10, 11], + request=None, + ) + # Content should be suppressed + assert result2.content == "" or result2.content is None + + # Stream ends (EOF) - no more deltas, no section_end marker + # Simulate this by manually checking state and resetting + # (In real usage, the request handler would call reset_streaming_state) + assert kimi_k2_tool_parser.in_tool_section is True # Still in section + + # Reset state (as would happen between requests) + kimi_k2_tool_parser.reset_streaming_state() + + # Verify clean slate + assert kimi_k2_tool_parser.in_tool_section is False + assert kimi_k2_tool_parser.token_buffer == "" + + # Next request should work normally + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="New reasoning", + delta_text="New reasoning", + previous_token_ids=[], + current_token_ids=[20, 21], + delta_token_ids=[20, 21], + request=None, + ) + assert result3 is not None + assert result3.content == "New reasoning" + + +def test_same_chunk_begin_and_end_markers(kimi_k2_tool_parser): + """ + CRITICAL TEST: Verify that when both section_begin and section_end + markers appear in the SAME chunk, the parser correctly: + 1. Enters the tool section + 2. Immediately exits the tool section + 3. 
Does NOT get stuck in in_tool_section=True state + + This tests the bug fix where elif was changed to if to handle + both state transitions in a single delta. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Single chunk with both markers (e.g., empty tool section) + combined_delta = "<|tool_calls_section_begin|><|tool_calls_section_end|>" + + result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning ", + current_text="Some reasoning " + combined_delta, + delta_text=combined_delta, + previous_token_ids=[1, 2], + current_token_ids=[1, 2, section_begin_id, section_end_id], + delta_token_ids=[section_begin_id, section_end_id], + request=None, + ) + + # CRITICAL: Parser should NOT be stuck in tool section + assert kimi_k2_tool_parser.in_tool_section is False, ( + "Parser stuck in tool section after processing both begin/end in same chunk. " + "This indicates the elif bug was not fixed." + ) + + # Result should be empty or contain only stripped content + assert result is not None + assert result.content == "" or result.content is None + + # Verify subsequent content streams correctly (not suppressed) + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning " + combined_delta, + current_text="Some reasoning " + combined_delta + " More reasoning", + delta_text=" More reasoning", + previous_token_ids=[1, 2, section_begin_id, section_end_id], + current_token_ids=[1, 2, section_begin_id, section_end_id, 10, 11], + delta_token_ids=[10, 11], + request=None, + ) + + # This content should NOT be suppressed (we're out of tool section) + assert result2 is not None + assert result2.content == " More reasoning" + + +def test_same_chunk_begin_content_end_markers(kimi_k2_tool_parser): + """ + Test the same-chunk scenario with actual content between markers. + Example: <|tool_calls_section_begin|> text <|tool_calls_section_end|> + all arriving in one delta. The key is that the state machine correctly + transitions in and out within the same chunk. 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Chunk with begin, some whitespace/noise, and end all together + # This simulates a tool section that opens and closes in the same chunk + combined_delta = "<|tool_calls_section_begin|> <|tool_calls_section_end|>" + + _result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning " + combined_delta, + delta_text=combined_delta, + previous_token_ids=[1], + current_token_ids=[1, section_begin_id, 100, section_end_id], + delta_token_ids=[section_begin_id, 100, section_end_id], + request=None, + ) + + # Parser should exit cleanly (not stuck in tool section) + assert kimi_k2_tool_parser.in_tool_section is False + + # Verify the fix: next content should stream normally, not be suppressed + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning " + combined_delta, + current_text="Reasoning " + combined_delta + " Done", + delta_text=" Done", + previous_token_ids=[1, section_begin_id, 100, section_end_id], + current_token_ids=[1, section_begin_id, 100, section_end_id, 200], + delta_token_ids=[200], + request=None, + ) + + # Content after section should be returned (not suppressed) + assert result2 is not None + assert result2.content == " Done" + + +def test_tool_call_end_and_section_end_same_chunk(kimi_k2_tool_parser): + """ + CRITICAL TEST (P1): Verify that when both <|tool_call_end|> and + <|tool_calls_section_end|> appear in the SAME chunk, the parser: + 1. Processes the tool_call_end first (emits final arguments) + 2. THEN exits the section + 3. Does NOT drop the final tool call update + 4. Does NOT leak special tokens into reasoning + + This tests the deferred section exit fix. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + tool_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>") + tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>") + + # Simulate a streaming sequence for a SHORT tool call (all in one chunk): + # 1. Reasoning text + result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="Let me help. ", + delta_text="Let me help. ", + previous_token_ids=[], + current_token_ids=[1, 2], + delta_token_ids=[1, 2], + request=None, + ) + assert result1 is not None + assert result1.content == "Let me help. " + + # 2. Section begin + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Let me help. ", + current_text="Let me help. <|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # 3. Tool call begin + full content + tool_end + section_end ALL IN ONE CHUNK + # This is the critical scenario for short tool calls + combined = ( + '<|tool_call_begin|>get_weather:0 <|tool_call_argument_begin|> {"city": "Paris"} ' + "<|tool_call_end|><|tool_calls_section_end|>" + ) + + # Build up the previous text gradually to simulate realistic streaming + prev_text = "Let me help. 
<|tool_calls_section_begin|>" + curr_text = prev_text + combined + + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text=prev_text, + current_text=curr_text, + delta_text=combined, + previous_token_ids=[1, 2, section_begin_id], + current_token_ids=[ + 1, + 2, + section_begin_id, + tool_begin_id, + 10, + 11, + 12, + tool_end_id, + section_end_id, + ], + delta_token_ids=[tool_begin_id, 10, 11, 12, tool_end_id, section_end_id], + request=None, + ) + + # CRITICAL: Parser should have exited section AFTER processing tool + assert kimi_k2_tool_parser.in_tool_section is False + + # Tool call should have been emitted (not dropped) + # The result might be the tool name or None depending on state, but + # importantly, it shouldn't be returning the literal tokens as content + + if result3 is not None and result3.content is not None: + # Verify no special tokens leaked into content + assert "<|tool_call_end|>" not in result3.content + assert "<|tool_calls_section_end|>" not in result3.content + + # 4. Verify subsequent content streams normally + result4 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text=curr_text, + current_text=curr_text + " Done", + delta_text=" Done", + previous_token_ids=[ + 1, + 2, + section_begin_id, + tool_begin_id, + 10, + 11, + 12, + tool_end_id, + section_end_id, + ], + current_token_ids=[ + 1, + 2, + section_begin_id, + tool_begin_id, + 10, + 11, + 12, + tool_end_id, + section_end_id, + 20, + ], + delta_token_ids=[20], + request=None, + ) + + # Content after tool section should stream normally + assert result4 is not None + assert result4.content == " Done" diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index 0453db58361a..a84c9e454716 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -34,8 +34,27 @@ def __init__(self, tokenizer: AnyTokenizer): str ] = [] # map what has been streamed for each tool so far to a list + # Section-level state management to prevent token leakage + self.in_tool_section: bool = False + self.token_buffer: str = "" + # Buffer size: empirical worst-case for longest marker (~30 chars) * 2 + # + safety margin for unicode + partial overlap. Prevents unbounded growth. 
+ self.buffer_max_size: int = 1024 + self.section_char_count: int = 0 # Track characters processed in tool section + self.max_section_chars: int = 8192 # Force exit if section exceeds this + self._buffer_overflow_logged: bool = False # Log overflow once per session + + # Support both singular and plural variants self.tool_calls_start_token: str = "<|tool_calls_section_begin|>" self.tool_calls_end_token: str = "<|tool_calls_section_end|>" + self.tool_calls_start_token_variants: list[str] = [ + "<|tool_calls_section_begin|>", + "<|tool_call_section_begin|>", # singular variant + ] + self.tool_calls_end_token_variants: list[str] = [ + "<|tool_calls_section_end|>", + "<|tool_call_section_end|>", # singular variant + ] self.tool_call_start_token: str = "<|tool_call_begin|>" self.tool_call_end_token: str = "<|tool_call_end|>" @@ -58,6 +77,18 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token) self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token) + # Get token IDs for all variants + self.tool_calls_start_token_ids: list[int] = [ + tid + for variant in self.tool_calls_start_token_variants + if (tid := self.vocab.get(variant)) is not None + ] + self.tool_calls_end_token_ids: list[int] = [ + tid + for variant in self.tool_calls_end_token_variants + if (tid := self.vocab.get(variant)) is not None + ] + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) @@ -70,6 +101,51 @@ def __init__(self, tokenizer: AnyTokenizer): "tokens in the tokenizer!" ) + def _check_and_strip_markers(self, text: str) -> tuple[str, bool, bool]: + """ + Check for section begin/end markers in text and strip them. + Returns: (cleaned_text, found_section_begin, found_section_end) + """ + found_begin = False + found_end = False + cleaned = text + + # Check for section begin markers (any variant) + for variant in self.tool_calls_start_token_variants: + if variant in cleaned: + cleaned = cleaned.replace(variant, "") + found_begin = True + + # Check for section end markers (any variant) + for variant in self.tool_calls_end_token_variants: + if variant in cleaned: + cleaned = cleaned.replace(variant, "") + found_end = True + + return cleaned, found_begin, found_end + + def _reset_section_state(self) -> None: + """Reset state when exiting tool section.""" + self.in_tool_section = False + self.token_buffer = "" + self.section_char_count = 0 + + def reset_streaming_state(self) -> None: + """ + Reset all streaming state. Call this between requests to prevent + state leakage when parser instance is reused. 
+ """ + # Reset section state + self._reset_section_state() + + # Reset parent class state + self.current_tool_name_sent = False + self.prev_tool_call_arr = [] + self.current_tool_id = -1 + self.streamed_args_for_tool = [] + + logger.debug("Streaming state reset") + def extract_tool_calls( self, model_output: str, @@ -131,13 +207,94 @@ def extract_tool_calls_streaming( ) -> DeltaMessage | None: logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) - # check to see if we should be streaming a tool call - is there a - if self.tool_calls_start_token_id not in current_token_ids: + + # Flag to defer section exit until after tool parsing completes + deferred_section_exit = False + + # Add delta to buffer for split marker detection + self.token_buffer += delta_text + + # Enforce buffer size limit to prevent memory issues + if len(self.token_buffer) > self.buffer_max_size: + if not self._buffer_overflow_logged: + logger.warning( + "Token buffer exceeded max size (%d bytes), flushing excess. " + "This may indicate very long markers or unusual tokenization.", + self.buffer_max_size, + ) + self._buffer_overflow_logged = True + # Keep only the most recent content that might contain partial markers + self.token_buffer = self.token_buffer[-self.buffer_max_size // 2 :] + + # Check buffer for section markers (handles split tokens) + buffered_text, found_section_begin, found_section_end = ( + self._check_and_strip_markers(self.token_buffer) + ) + + # Track section state transitions + if found_section_begin and not self.in_tool_section: + logger.debug("Entering tool section") + self.in_tool_section = True + self.token_buffer = buffered_text # Use cleaned buffer + self.section_char_count = 0 # Reset counter for new section + if found_section_end and self.in_tool_section: + logger.debug("Detected section end marker") + # CRITICAL: Don't exit early if tool_call_end is in this chunk. + # Tool parser must emit final arguments/close first to avoid dropping + # the final tool update and leaking tokens into reasoning channel. 
+ has_tool_end = self.tool_call_end_token_id in delta_token_ids + if has_tool_end: + # Defer exit until after tool parsing completes + deferred_section_exit = True + logger.debug("Deferring section exit: tool_call_end in same chunk") + self.token_buffer = buffered_text + else: + # No tool call ending, safe to exit immediately + logger.debug("Exiting tool section") + remaining = buffered_text + self._reset_section_state() + # Return remaining text as reasoning content if non-empty + if remaining.strip(): + return DeltaMessage(content=remaining) + # Return empty delta to maintain function contract + # (always returns DeltaMessage) + return DeltaMessage(content="") + else: + self.token_buffer = buffered_text + + # Check if any variant of section start token is in current_token_ids + has_section_token = any( + tid in current_token_ids for tid in self.tool_calls_start_token_ids + ) + + # Early return: if no section token detected yet, return as reasoning content + if not has_section_token and not self.in_tool_section: logger.debug("No tool call tokens found!") + # Don't clear buffer - it needs to accumulate partial markers across deltas + # Buffer overflow is already protected by lines 215-224 return DeltaMessage(content=delta_text) - delta_text = delta_text.replace(self.tool_calls_start_token, "").replace( - self.tool_calls_end_token, "" - ) + + # Strip section markers from delta_text for subsequent processing + # NOTE: This preprocessing happens BEFORE the regex-based tool call + # parsing (from PR #24847) to ensure markers are removed cleanly + # before pattern matching. No double-stripping occurs because + # section markers and tool call markers are distinct. + delta_text, _, _ = self._check_and_strip_markers(delta_text) + + # Error recovery: If in tool section for too long, force exit + if self.in_tool_section: + self.section_char_count += len(delta_text) + if self.section_char_count > self.max_section_chars: + logger.warning( + "Tool section exceeded max length (%d chars), forcing exit. " + "This may indicate malformed model output.", + self.max_section_chars, + ) + self._reset_section_state() + # Deferred exit already handled by forced exit above + # Return remaining content as reasoning (or empty delta if no content) + return DeltaMessage(content=delta_text if delta_text.strip() else "") + try: # figure out where we are in the parsing by counting tool call # start & end tags @@ -158,6 +315,16 @@ def extract_tool_calls_streaming( and prev_tool_end_count == cur_tool_end_count and self.tool_call_end_token not in delta_text ): + # CRITICAL FIX: Suppress content if in tool section but + # no tool calls started + if self.in_tool_section and cur_tool_start_count == 0: + logger.debug( + "In tool section but no tool calls started yet. " + "Suppressing: %s", + delta_text, + ) + # Return empty delta to maintain iterator contract + return DeltaMessage(content="") logger.debug("Generating text content! 
skipping tool parsing.") return DeltaMessage(content=delta_text) @@ -209,6 +376,9 @@ def extract_tool_calls_streaming( ): if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0: logger.debug("attempting to close tool call, but no tool call") + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() return None diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments") if diff: @@ -218,6 +388,9 @@ def extract_tool_calls_streaming( else diff ) if '"}' not in delta_text: + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() return None end_loc = delta_text.rindex('"}') diff = delta_text[:end_loc] + '"}' @@ -227,6 +400,10 @@ def extract_tool_calls_streaming( diff, ) self.streamed_args_for_tool[self.current_tool_id] += diff + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + logger.debug("Completing deferred section exit") + self._reset_section_state() return DeltaMessage( tool_calls=[ DeltaToolCall( @@ -240,9 +417,19 @@ def extract_tool_calls_streaming( # case -- otherwise we're just generating text else: + # Check if we're in tool section - if so, suppress + if self.in_tool_section: + logger.debug("In tool section, suppressing text generation") + # Handle deferred section exit before returning + if deferred_section_exit: + self._reset_section_state() + return DeltaMessage(content="") text = delta_text.replace(self.tool_call_start_token, "") text = text.replace(self.tool_call_end_token, "") delta = DeltaMessage(tool_calls=[], content=text) + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() return delta current_tool_call = dict() @@ -390,6 +577,11 @@ def extract_tool_calls_streaming( else: self.prev_tool_call_arr.append(current_tool_call) + # Handle deferred section exit after tool parsing completes + if deferred_section_exit and self.in_tool_section: + logger.debug("Completing deferred section exit") + self._reset_section_state() + return delta except Exception: From 3380ed5e115613bb0029164754ffea99f328e065 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 17 Nov 2025 14:08:48 +0800 Subject: [PATCH 123/578] [Doc] Add llama4 LoRA tag (#28825) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6eb0947fe568..d47aeaab511b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -684,7 +684,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | | `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I+ | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ | -| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. 
| | ✅︎ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | | `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | From 679a7cffdc3baf2a2f205d993a60a8925ebfd358 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 17 Nov 2025 06:29:29 +0000 Subject: [PATCH 124/578] WIP: Integrate Aiter bpreshuffle and ck kernels Signed-off-by: vllmellm --- vllm/_aiter_ops.py | 56 ++++++ .../kernels/scaled_mm/__init__.py | 4 + .../quantization/kernels/scaled_mm/aiter.py | 165 ++++++++++++++++++ 3 files changed, 225 insertions(+) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 5508e59bcd2f..6de21176e948 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -402,6 +402,42 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( return torch.empty_like(x), torch.empty_like(residual) +def _rocm_aiter_gemm_a8w8_bpreshuffle_impl( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + # This AITER function can be used for + # - per-token activations + per-channel weights + # accept the weight as # keep the weight as (N, K) + # NOTE: The weight has to be shuffled in the + # process_weights_after_loading of the CompressedTensorsW8A8Fp8 class + + from aiter import gemm_a8w8_bpreshuffle_ck + + m = input.shape[0] + n = weight.shape[0] + Y = torch.empty(m, n, dtype=out_dtype, device=input.device) + gemm_a8w8_bpreshuffle_ck(input, weight, scale_a, scale_b, Y) + return Y + + +def _rocm_aiter_gemm_a8w8_bpreshuffle_fake( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + m = input.shape[0] + n = weight.shape[0] + if out_dtype is None: + out_dtype = input.dtype + return torch.empty((m, n), dtype=out_dtype, device=input.device) + + # Global flag to ensure ops are registered only once _OPS_REGISTERED = False @@ -592,6 +628,14 @@ def register_ops_once() -> None: dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_gemm_a8w8_bpreshuffle", + op_func=_rocm_aiter_gemm_a8w8_bpreshuffle_impl, + mutates_args=[], + fake_impl=_rocm_aiter_gemm_a8w8_bpreshuffle_fake, + dispatch_key=current_platform.dispatch_key, + ) + _OPS_REGISTERED = True @staticmethod @@ -635,6 +679,18 @@ def gemm_a8w8_blockscale( A, B, As, Bs, output_dtype ) + @staticmethod + def gemm_a8w8_bpreshuffle( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_gemm_a8w8_bpreshuffle( + input, weight, out_dtype, scale_a, scale_b + ) + @staticmethod def fused_moe( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py 
b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 36e4a16c0168..90cbda90adf9 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -8,6 +8,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, AiterScaledMMLinearKernel, ) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( @@ -64,6 +66,8 @@ ChannelWiseTorchScaledMMLinearKernel, ], PlatformEnum.ROCM: [ + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, ROCmScaledMMLinearKernel, PerTensorTorchScaledMMLinearKernel, RowWiseTorchScaledMMLinearKernel, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 4a1c76ffd9b1..28c5640d319a 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -2,17 +2,25 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + import torch +from aiter.ops.shuffle import shuffle_weight from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger from vllm.platforms import current_platform from .cutlass import CutlassScaledMMLinearKernel from .ScaledMMLinearKernel import ( + FP8ScaledMMLinearKernel, + FP8ScaledMMLinearLayerConfig, Int8ScaledMMLinearLayerConfig, ) +logger = init_logger(__name__) + class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): @classmethod @@ -117,3 +125,160 @@ def apply_weights( # b to be [N, K] # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format return rocm_aiter_ops.gemm_a8w8(x_q, w_q.t(), x_s, w_s, bias, out_dtype) + + +class AiterBpreshufflePerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + def get_ouput_padding(self) -> int | None: + # PTPC kernels do not require padding. + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER bpreshuffle is ROCm-only") + + if not rocm_aiter_ops.is_linear_enabled(): + return (False, "AITER bpreshuffle is disabled by env var") + + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + # Check if the configuration is PTPC + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterBpreshuffle: can_implement called. 
is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once("AiterBpreshuffle: SHUFFLING WEIGHTS NOW.") + + w_q, _, _, _ = self._get_layer_params(layer) + + N = w_q.shape[1] + K = w_q.shape[0] + + if N % 16 == 0 and K % 16 == 0: + # AITER shuffle_weight expectation [N, K] + w_q_nk = w_q.t().contiguous() + + # Execute shuffle + shuffled_w_nk = shuffle_weight(w_q_nk, layout=(16, 16)) + + del layer.weight + layer.register_buffer("weight", shuffled_w_nk) + + logger.info_once("[AiterBpreshuffle: Weight shuffle COMPLETE.") + + else: + raise ValueError( + f"Weight shape (N={N}, K={K}) not divisible by 16 " + "for AITER bpreshuffle." + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # 1. Obtain parameters + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + # 2. Dynamic quantization input + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterBpreshuffle: apply_weights... ABOUT TO CALL C++ KERNEL..." + ) + + output = rocm_aiter_ops.gemm_a8w8_bpreshuffle( + qinput, + w_q, # Input [N, K] shuffle weights + out_dtype=self.config.out_dtype, + scale_a=qinput_scale, + scale_b=w_s, + ) + + logger.info_once("AiterBpreshuffle: C++ KERNEL CALL SUCCEEDED.") + + if bias is not None: + output.add_(bias) + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_ops.gemm_a8w8_bpreshuffle + + +class AiterCKPerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + """ + AITER PTPC kernel (gemm_a8w8_CK) without pre-shuffling. + """ + + def get_ouput_padding(self) -> int | None: + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER CK is ROCm-only") + + if not rocm_aiter_ops.is_linear_enabled(): + return (False, "AITER CK is disabled by env var") + + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterCK: can_implement called. is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once( + "AITER CK: process_weights_after_loading... DOING NOTHING (pass)." + ) + pass + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterCK: apply_weights... " + "ABOUT TO CALL C++ KERNEL (this is where it hangs)..." 
+ ) + + output = rocm_aiter_ops.gemm_a8w8( + qinput, w_q.t(), qinput_scale, w_s, bias, self.config.out_dtype + ) + + logger.info_once("AiterCK: C++ KERNEL CALL SUCCEEDED.") + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_ops.gemm_a8w8 From 577bb34fffc83598d3e4940f8492c122d9e3318d Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Mon, 17 Nov 2025 15:47:24 +0800 Subject: [PATCH 125/578] [CPU][Bugfix] Fix _to_list in CPU model runner (#28824) Signed-off-by: jiang1.li --- csrc/cpu/torch_bindings.cpp | 8 ++++++++ vllm/v1/worker/cpu_model_runner.py | 3 --- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5e2aa7069256..9fefd88cd9b0 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache( const torch::Tensor& scheduler_metadata, const std::optional& s_aux); +// Note: just for avoiding importing errors +void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); } + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "sliding_window_left, SymInt sliding_window_right, Tensor block_table, " "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()", &cpu_attention_with_kv_cache); + + // placeholders + ops.def("static_scaled_fp8_quant() -> ()", placeholder_op); + ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op); + ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index ceb1cf64b588..40f011fed1ad 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -80,9 +80,6 @@ def _init_device_properties(self) -> None: def _sync_device(self) -> None: pass - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: - return sampled_token_ids.tolist() - def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]: # Note: For CPU backend, dp padding is not required for now. return 0, None From ab01cd14e5e2ef65549b459d0a2bf3a2540c9f3f Mon Sep 17 00:00:00 2001 From: wuyaoxuehun <798143193@qq.com> Date: Mon, 17 Nov 2025 16:13:11 +0700 Subject: [PATCH 126/578] [BugFix] Fix glm4_moe_mtp load weights bug (#28805) Signed-off-by: wuyaoxuehun <798143193@qq.com> --- vllm/model_executor/models/glm4_moe_mtp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 110ed0a64633..e34ae6c85a4f 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -256,13 +256,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() - spec_layer = self.model.mtp_start_layer_idx for name, loaded_weight in weights: if name == "lm_head.weight": - name = f"model.layers.{spec_layer}.shard_head.head.weight" + spec_layer = self.model.mtp_start_layer_idx + name = f"model.layers.{spec_layer}.shared_head.head.weight" elif name == "model.embed_tokens.weight": - # This name is same with local model, rewriting is not needed. 
- pass + spec_layer = self.model.mtp_start_layer_idx else: spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) if spec_layer is None: From d4acf518d09515560e1082a80b8a4d6550e20d9b Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Mon, 17 Nov 2025 04:54:15 -0500 Subject: [PATCH 127/578] [Metrics] Fix KV cache usage percent metric multiproc (#28792) The `vllm:kv_cache_usage_perc` Gauge metric is missing `multiprocess_mode="mostrecent"` and ends up returning ``` vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-VL-8B-Instruct",pid="277"} 0.0 vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-VL-8B-Instruct",pid="275"} 0.0 vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-VL-8B-Instruct",pid="273"} 0.6530455880475035 ... ``` The deprecated `vllm:gpu_cache_usage_perc` Gauge metric has `multiprocess_mode="mostrecent"`. Signed-off-by: Jae-Won Chung --- vllm/v1/metrics/loggers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 21280b9c84cf..cb36e7973650 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -494,6 +494,7 @@ def __init__( gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", documentation="KV-cache usage. 1 means 100 percent usage.", + multiprocess_mode="mostrecent", labelnames=labelnames, ) self.gauge_kv_cache_usage = make_per_engine( From 1b82fb0ad3cea2e1a31da4fa20dd736a8a181089 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 17 Nov 2025 21:16:44 +0800 Subject: [PATCH 128/578] [XPU] work around for sp, avoid custom op import error (#28822) Signed-off-by: Kunshang Ji --- vllm/compilation/pass_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 0c2210d72ce0..0e8bb2fc9735 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -18,6 +18,7 @@ from .fusion import RMSNormQuantFusionPass from .fusion_attn import AttnFusionPass from .qk_norm_rope_fusion import QKNormRoPEFusionPass + from .sequence_parallelism import SequenceParallelismPass if current_platform.is_cuda(): from .collective_fusion import AllReduceFusionPass, AsyncTPPass @@ -25,7 +26,6 @@ from .fix_functionalization import FixFunctionalizationPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context from .noop_elimination import NoOpEliminationPass -from .sequence_parallelism import SequenceParallelismPass logger = init_logger(__name__) From 64e39d667cb5b550e6ce148acd3d4dcd1654eace Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 17 Nov 2025 09:41:22 -0500 Subject: [PATCH 129/578] [BugFix] Temporary fix for IMA with MTP = 2 and full-cg (#28315) Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 77 +++++++++++++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 16 +++++++ 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 10673041aa68..088d0b1af757 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -18,6 +18,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname +from vllm.utils.math_utils import round_up from vllm.utils.torch_utils import is_torch_equal_or_newer if TYPE_CHECKING: @@ -773,19 +774,8 @@ def post_init_cudagraph_sizes(self) -> None: if self.cudagraph_capture_sizes: assert 
self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size - # pre-compute the mapping from batch size to padded graph size - self.bs_to_padded_graph_size = [ - 0 for i in range(self.max_cudagraph_capture_size + 1) - ] - for end, start in zip( - self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1], - [0] + self.cudagraph_capture_sizes, - ): - for bs in range(start, end): - if bs == start: - self.bs_to_padded_graph_size[bs] = start - else: - self.bs_to_padded_graph_size[bs] = end + # May get recomputed in the model runner if adjustment is needed for spec-decode + self.compute_bs_to_padded_graph_size() def set_splitting_ops_for_v1(self): # NOTE: this function needs to be called only when mode is @@ -922,3 +912,64 @@ def custom_op_log_check(self): enable_str, op, ) + + def adjust_cudagraph_sizes_for_spec_decode( + self, uniform_decode_query_len: int, tensor_parallel_size: int + ): + multiple_of = uniform_decode_query_len + if tensor_parallel_size > 1: + multiple_of = max(uniform_decode_query_len, tensor_parallel_size) + if ( + multiple_of % uniform_decode_query_len != 0 + or multiple_of % tensor_parallel_size != 0 + ): + raise ValueError( + f"Can't determine cudagraph shapes that are both a " + f"multiple of {uniform_decode_query_len} " + f"(num_speculative_tokens + 1) required by spec-decode " + f"and {tensor_parallel_size} (tensor_parallel_size) " + f"required by sequence parallelism please adjust " + f"num_speculative_tokens or disable sequence parallelism" + ) + + if not self.cudagraph_capture_sizes or multiple_of <= 1: + return + + assert self.max_cudagraph_capture_size is not None + rounded_sizes = sorted( + set( + round_up(size, multiple_of) + for size in self.cudagraph_capture_sizes + if round_up(size, multiple_of) <= self.max_cudagraph_capture_size + ) + ) + + if len(rounded_sizes) == 0: + logger.warning( + "No valid cudagraph sizes after rounding to multiple of " + " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens" + " or max_cudagraph_capture_size (or cudagraph_capture_sizes)", + multiple_of, + ) + return + + self.max_cudagraph_capture_size = rounded_sizes[-1] + self.cudagraph_capture_sizes = rounded_sizes + + # Recompute after adjusting the cudagraph sizes + self.compute_bs_to_padded_graph_size() + + def compute_bs_to_padded_graph_size(self): + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_graph_size = [ + 0 for i in range(self.max_cudagraph_capture_size + 1) + ] + for end, start in zip( + self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1], + [0] + self.cudagraph_capture_sizes, + ): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_graph_size[bs] = start + else: + self.bs_to_padded_graph_size[bs] = end diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ffbac5fe12f7..4fe1b6487d58 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4332,6 +4332,22 @@ def _check_and_update_cudagraph_mode( "and make sure compilation mode is VLLM_COMPILE" ) + # if we have dedicated decode cudagraphs, and spec-decode is enabled, + # we need to adjust the cudagraph sizes to be a multiple of the uniform + # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207 + # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536 + # Will be removed in the near future when we have seperate cudagraph capture + # sizes for decode and mixed prefill-decode. 
+ if ( + cudagraph_mode.decode_mode() == CUDAGraphMode.FULL + and cudagraph_mode.separate_routine() + and self.uniform_decode_query_len > 1 + ): + self.compilation_config.adjust_cudagraph_sizes_for_spec_decode( + self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size + ) + self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes + # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. self.cudagraph_dispatcher.initialize_cudagraph_keys( From 7f064491f80ba20e782f33f4da566ec7da5118d7 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 17 Nov 2025 06:49:25 -0800 Subject: [PATCH 130/578] [Bugfix][Perf] Revert applying HF processor on text-only inputs for multimodal models (#28858) Signed-off-by: Roger Wang --- tests/test_inputs.py | 35 +++++++---------------------------- vllm/inputs/preprocess.py | 14 ++++---------- 2 files changed, 11 insertions(+), 38 deletions(-) diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 50a273016ab8..b1fb4e06a690 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -86,34 +86,6 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): assert zipped["mm_processor_kwargs"] == exp_kwargs -@pytest.mark.parametrize( - "model_id", - [ - "facebook/opt-125m", - ], -) -@pytest.mark.parametrize( - "prompt", - [ - { - "prompt": "", - "multi_modal_data": {"dummy": []}, - }, - { - "prompt_token_ids": [], - "multi_modal_data": {"dummy": []}, - }, - ], -) -def test_preprocessor_text_no_mm_inputs(model_id, prompt): - model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_configs(model_config) - input_preprocessor = InputPreprocessor(model_config, tokenizer) - - with pytest.raises(ValueError, match="does not support multimodal inputs"): - input_preprocessor.preprocess(prompt) - - @pytest.mark.parametrize( "model_id", [ @@ -127,6 +99,13 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt): {"prompt_token_ids": []}, ], ) +@pytest.mark.skip( + reason=( + "Applying huggingface processor on text inputs results in " + "significant performance regression for multimodal models. 
" + "See https://github.com/vllm-project/vllm/issues/26320" + ) +) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) tokenizer = init_tokenizer_from_configs(model_config) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 80d5322a34c3..839c13868a16 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -348,18 +348,15 @@ def _process_tokens( ) inputs: TokenInputs | MultiModalInputs - if self.model_config.is_multimodal_model: + if multi_modal_data := parsed_content.get("multi_modal_data"): inputs = self._process_multimodal( prompt_token_ids, - parsed_content.get("multi_modal_data") or {}, + multi_modal_data, parsed_content.get("mm_processor_kwargs") or {}, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, ) else: - if parsed_content.get("multi_modal_data"): - raise ValueError("This model does not support multimodal inputs") - inputs = token_inputs(prompt_token_ids) if cache_salt := parsed_content.get("cache_salt"): @@ -377,18 +374,15 @@ def _process_text( prompt_text = parsed_content["prompt"] inputs: TokenInputs | MultiModalInputs - if self.model_config.is_multimodal_model: + if multi_modal_data := parsed_content.get("multi_modal_data"): inputs = self._process_multimodal( prompt_text, - parsed_content.get("multi_modal_data") or {}, + multi_modal_data, parsed_content.get("mm_processor_kwargs") or {}, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, ) else: - if parsed_content.get("multi_modal_data"): - raise ValueError("This model does not support multimodal inputs") - prompt_token_ids = self._tokenize_prompt( prompt_text, tokenization_kwargs=tokenization_kwargs, From e42bd8c2e3bfecdaf9c5a7ad99d7c7d7cb75a7b5 Mon Sep 17 00:00:00 2001 From: tiehexue Date: Tue, 18 Nov 2025 00:02:32 +0800 Subject: [PATCH 131/578] Cast return value to int64_t for cache size (#28814) Signed-off-by: tiehexue --- csrc/cpu/cpu_attn_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 5de8a114b2b5..344296528b65 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -754,7 +754,7 @@ class AttentionScheduler { return l2_cache_size >> 1; // use 50% of L2 cache } // Fallback if sysctlbyname fails - return 128 * 1024 >> 1; // use 50% of 128KB + return 128LL * 1024 >> 1; // use 50% of 128KB #else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); TORCH_CHECK_NE(l2_cache_size, -1); From f8b19c0ffd65f7f6f01a0da4a39b6890f5db40cb Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Mon, 17 Nov 2025 10:15:26 -0800 Subject: [PATCH 132/578] [Bugfix] Fix GPT-OSS on AMD after #28603 (#28816) Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 9 +++++---- vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e232000511c3..2471b509a9ff 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1068,7 +1068,7 @@ steps: # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # Wrap with quotes to escape yaml + # Wrap with quotes to escape yaml - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" - label: Blackwell Fusion E2E Tests # 
30 min @@ -1095,10 +1095,11 @@ steps: # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile -- label: Blackwell GPT-OSS Eval +- label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - gpu: b200 + agent_pool: mi325_1 + mirror_hardwares: [amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1107,7 +1108,7 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5552c1ae5edf..b95d1a6b3a1f 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,8 @@ def _interleave_mxfp4_cutlass_sm90(w): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = Parameter(w13_weight.data, requires_grad=False) - layer.w2_weight = Parameter(w2_weight.data, requires_grad=False) + layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False) + layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False) else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") From d8874c61a55e40db4ada047f1736c38c86439fff Mon Sep 17 00:00:00 2001 From: Ronald Date: Tue, 18 Nov 2025 04:16:20 +0800 Subject: [PATCH 133/578] [Core] Async Scheduling X Spec Decoding Compatibility (#24799) Signed-off-by: Ronald1995 Signed-off-by: Nick Hill Signed-off-by: Benjamin Chislett Co-authored-by: Nick Hill Co-authored-by: Benjamin Chislett --- tests/v1/e2e/test_async_scheduling.py | 38 +-- vllm/config/speculative.py | 38 ++- vllm/config/vllm.py | 21 +- vllm/v1/core/sched/async_scheduler.py | 15 +- vllm/v1/core/sched/scheduler.py | 12 +- vllm/v1/engine/core.py | 6 +- vllm/v1/engine/processor.py | 17 ++ vllm/v1/sample/logits_processor/__init__.py | 2 +- vllm/v1/spec_decode/eagle.py | 7 +- vllm/v1/worker/gpu_input_batch.py | 3 + vllm/v1/worker/gpu_model_runner.py | 253 +++++++++++++++++--- 11 files changed, 314 insertions(+), 98 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index c4aca82416cd..f732b05f09f9 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -15,7 +15,7 @@ from ...models.utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" -MTP_MODEL = "XiaomiMiMo/MiMo-7B-Base" +MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct" first_prompt = ( @@ -29,7 +29,8 @@ default_params = dict( temperature=0.0, # greedy - max_tokens=20, + max_tokens=23, + min_tokens=18, ) @@ -69,15 +70,9 @@ def test_without_spec_decoding( (True, "uni", True, None, True), ] - run_tests( - monkeypatch, - MODEL, - test_configs, - test_sampling_params, - ) + run_tests(monkeypatch, MODEL, test_configs, test_sampling_params) -@pytest.mark.skip("MTP model too big to run in fp32 in CI") def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): """Test consistency and acceptance rates with some different combos of preemption, executor, async scheduling, 
prefill chunking, @@ -85,8 +80,9 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): """ spec_config = { - "method": "mtp", + "method": "eagle3", "num_speculative_tokens": 2, + "model": "nm-testing/Llama3_2_1B_speculator.eagle3", } spec_config_short = spec_config | {"max_model_len": 50} @@ -106,12 +102,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (True, "uni", True, spec_config_short, True), ] - run_tests( - monkeypatch, - MTP_MODEL, - test_configs, - [{}], - ) + run_tests(monkeypatch, MTP_MODEL, test_configs, [{}]) @dynamo_config.patch(cache_size_limit=16) @@ -182,15 +173,13 @@ def run_tests( and test_acceptance_rate is not None ): if "spec_mml=None" in test_config: - # because the acceptance rate can vary, we use a looser - # tolerance here. assert ( pytest.approx(test_acceptance_rate, rel=5e-2) == base_acceptance_rate ) else: # Currently the reported acceptance rate is expected to be - # lower when we skip drafting altogether. + # lower when we sometimes skip drafting altogether. assert test_acceptance_rate > 0.05 print( f"PASSED: config=[{test_config}], params={params}" @@ -220,6 +209,7 @@ def run_test( ): spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( + # Force preemptions dict(num_gpu_blocks_override=32) if test_preemption else dict(gpu_memory_utilization=0.9) @@ -238,6 +228,7 @@ def run_test( model, max_model_len=512, enable_chunked_prefill=test_prefill_chunking, + # Force prefill chunking max_num_batched_tokens=48 if test_prefill_chunking else None, # enforce_eager=True, async_scheduling=async_scheduling, @@ -255,10 +246,7 @@ def run_test( results.append( vllm_model.generate( example_prompts, - sampling_params=SamplingParams( - **default_params, - **override_params, - ), + sampling_params=SamplingParams(**default_params, **override_params), return_logprobs=True, ) ) @@ -270,9 +258,7 @@ def run_test( if test_preemption: preemptions = _get_count( - metrics_before, - metrics_after, - "vllm:num_preemptions", + metrics_before, metrics_after, "vllm:num_preemptions" ) assert preemptions > 0, "preemption test had no preemptions" diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 31cdeabe501d..13a8632413d9 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -3,7 +3,7 @@ import ast import hashlib -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, get_args from pydantic import Field, SkipValidation, model_validator from pydantic.dataclasses import dataclass @@ -29,31 +29,25 @@ logger = init_logger(__name__) -SpeculativeMethod = Literal[ - "ngram", - "eagle", - "eagle3", - "medusa", - "mlp_speculator", - "draft_model", - "deepseek_mtp", - "ernie_mtp", - "qwen3_next_mtp", - "mimo_mtp", - "longcat_flash_mtp", - "pangu_ultra_moe_mtp", - "mtp", - "suffix", -] -MTP_MODEL_TYPES = ( +MTPModelTypes = Literal[ "deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp", "qwen3_next_mtp", "longcat_flash_mtp", + "mtp", "pangu_ultra_moe_mtp", -) +] +EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes] +SpeculativeMethod = Literal[ + "ngram", + "medusa", + "mlp_speculator", + "draft_model", + "suffix", + EagleModelTypes, +] @config @@ -244,7 +238,7 @@ def __post_init__(self): # can not be detected, it will be considered as the "draft_model" by # default. 
-        if self.method in MTP_MODEL_TYPES:
+        if self.method in get_args(MTPModelTypes) and self.method != "mtp":
             logger.warning(
                 "method `%s` is deprecated and replaced with mtp.", self.method
             )
@@ -361,7 +355,9 @@ def __post_init__(self):
                     self.method = "medusa"
                 elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
                     self.method = "mlp_speculator"
-                elif self.draft_model_config.hf_config.model_type in MTP_MODEL_TYPES:
+                elif self.draft_model_config.hf_config.model_type in get_args(
+                    MTPModelTypes
+                ):
                     self.method = "mtp"
                     if self.num_speculative_tokens > 1:
                         logger.warning(
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index bd98be48588f..672b004c4aa5 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -14,13 +14,14 @@
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar, get_args
 
 import torch
 from pydantic import ConfigDict, Field, model_validator
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
+from vllm.config.speculative import EagleModelTypes
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid
@@ -374,10 +375,22 @@ def __post_init__(self):
                     "Async scheduling is not yet compatible with "
                     "pipeline_parallel_size > 1."
                 )
+            # Currently, async scheduling only supports EAGLE/MTP-style
+            # speculative decoding.
             if self.speculative_config is not None:
-                raise ValueError(
-                    "Async scheduling is not yet compatible with speculative decoding."
-                )
+                if self.speculative_config.method not in get_args(EagleModelTypes):
+                    raise ValueError(
+                        "Currently, async scheduling is only supported "
+                        "with EAGLE/MTP-style speculative decoding."
+                    )
+                if self.speculative_config.disable_padded_drafter_batch:
+                    raise ValueError(
+                        "Async scheduling with EAGLE/MTP speculative "
+                        "decoding is enabled, but "
+                        "disable_padded_drafter_batch=True is not supported "
+                        "in this configuration. Please set "
+                        "disable_padded_drafter_batch=False."
+                    )
             if not executor_supports_async_sched:
                 raise ValueError(
                     "Currently, async scheduling only supports `mp`, `uni`, or "
diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py
index 0ad994c360b0..3214f65a0972 100644
--- a/vllm/v1/core/sched/async_scheduler.py
+++ b/vllm/v1/core/sched/async_scheduler.py
@@ -16,18 +16,25 @@ def _update_after_schedule(
     ) -> None:
         super()._update_after_schedule(scheduler_output)
         pending_structured_output_tokens = False
+        spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
         for req_id in scheduler_output.num_scheduled_tokens:
            request = self.requests[req_id]
             pending_structured_output_tokens |= (
                 request.use_structured_output and request.num_output_placeholders > 0
             )
+            cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
             if (
                 request.num_computed_tokens
-                == request.num_tokens + request.num_output_placeholders
+                == request.num_tokens
+                + request.num_output_placeholders
+                + cur_num_spec_tokens
             ):
-                # The request will generate a new token in this scheduling step.
-                # TODO(woosuk): Support speculative decoding.
-                request.num_output_placeholders += 1
+                # The request will generate a new token plus num_spec_tokens
+                # in this scheduling step.
+                request.num_output_placeholders += 1 + cur_num_spec_tokens
+                # Add placeholders for the new tokens in spec_token_ids.
+ # Wwe will update the actual spec token ids in the worker process. + request.spec_token_ids = [-1] * self.num_spec_tokens scheduler_output.pending_structured_output_tokens = ( pending_structured_output_tokens diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8e62542337a7..61640e856ac1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -348,7 +348,10 @@ def schedule(self) -> SchedulerOutput: # Speculative decode related. if request.spec_token_ids: num_scheduled_spec_tokens = ( - num_new_tokens + request.num_computed_tokens - request.num_tokens + num_new_tokens + + request.num_computed_tokens + - request.num_tokens + - request.num_output_placeholders ) if num_scheduled_spec_tokens > 0: # Trim spec_token_ids list to num_scheduled_spec_tokens. @@ -1024,7 +1027,12 @@ def update_from_output( # tokens and rejections. If some tokens are rejected, # num_computed_tokens is decreased by the number of rejected # tokens. - request.num_computed_tokens -= num_rejected + if request.num_computed_tokens > 0: + request.num_computed_tokens -= num_rejected + # If async scheduling, num_output_placeholders also includes + # the scheduled spec tokens count and so is similarly adjusted. + if request.num_output_placeholders > 0: + request.num_output_placeholders -= num_rejected spec_decoding_stats = self.make_spec_decoding_stats( spec_decoding_stats, num_draft_tokens=num_draft_tokens, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a6965182fc2c..508669cf527d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,6 +198,7 @@ def __init__( self.step_fn = ( self.step if self.batch_queue is None else self.step_with_batch_queue ) + self.async_scheduling = vllm_config.scheduler_config.async_scheduling # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. @@ -341,7 +342,10 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0 def post_step(self, model_executed: bool) -> None: - if self.use_spec_decode and model_executed: + # When using async scheduling we can't get draft token ids in advance, + # so we update draft token ids in the worker process and don't + # need to update draft token ids here. + if not self.async_scheduling and self.use_spec_decode and model_executed: # Take the draft token ids. draft_token_ids = self.model_executor.take_draft_token_ids() if draft_token_ids is not None: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index fffd075a5165..4cb911d8e22b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -150,6 +150,23 @@ def _validate_supported_sampling_params( raise ValueError( "vLLM V1 does not support per request user provided logits processors." ) + # Async scheduling + spec decode currently incompatible with some + # sampling parameters. + if ( + self.vllm_config.speculative_config is not None + and self.vllm_config.scheduler_config.async_scheduling + and ( + params.frequency_penalty != 0.0 + or params.presence_penalty != 0.0 + or params.repetition_penalty != 1.0 + or params.bad_words_token_ids + or params.structured_outputs + ) + ): + raise ValueError( + "async scheduling with spec decoding doesn't yet support " + "penalties, bad words or structured outputs in sampling parameters." 
+ ) def _validate_params( self, diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index 5992c4066c9c..8b174af4c779 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -41,7 +41,7 @@ # Error message when the user tries to initialize vLLM with a speculative # decoding enabled and custom logitsproces STR_SPEC_DEC_REJECTS_LOGITSPROCS = ( - "Custom logits processors are not supportedwhen speculative decoding is enabled." + "Custom logits processors are not supported when speculative decoding is enabled." ) LOGITSPROCS_GROUP = "vllm.logits_processors" diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index ed602f39d0f9..5bf2503c3027 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -397,10 +397,13 @@ def propose( positions += 1 exceeds_max_model_len = positions >= self.max_model_len clamped_positions = torch.where(exceeds_max_model_len, 0, positions) - + # For data integrity when async scheduling, we shouldn't use in place + # operations in case they are modified in next step's `prepare_input` + # of main model. # Increment the sequence lengths. common_attn_metadata.seq_lens += 1 - common_attn_metadata.seq_lens_cpu += 1 + # This is an out-of-place operation to avoid modifying the original tensor. + common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1 # For the requests that exceed the max model length, we set the # sequence length to 1 to minimize their overheads in attention. diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 393181f543d2..7cf6afa3fc37 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -46,6 +46,9 @@ class CachedRequestState: lora_request: LoRARequest | None = None prompt_embeds: torch.Tensor | None = None + # Used when both async_scheduling and spec_decode are enabled. + prev_num_draft_len: int = 0 + def __post_init__(self): self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds( self.prompt_token_ids, self.prompt_embeds diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4fe1b6487d58..758e3e1b3a82 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -179,6 +179,7 @@ def __init__( logprobs_tensors: torch.Tensor | None, invalid_req_indices: list[int], async_output_copy_stream: torch.cuda.Stream, + vocab_size: int, ): self._model_runner_output = model_runner_output self._invalid_req_indices = invalid_req_indices @@ -189,6 +190,7 @@ def __init__( # Keep a reference to the device tensor to avoid it being # deallocated until we finish copying it to the host. self._sampled_token_ids = sampled_token_ids + self.vocab_size = vocab_size self._logprobs_tensors = logprobs_tensors # Initiate the copy on a separate stream, but do not synchronize it. @@ -215,10 +217,16 @@ def get_output(self) -> ModelRunnerOutput: # Release the device tensors once the copy has completed. 
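# Minimal sketch (not part of the patch) of the aliasing hazard that the
# eagle.py hunk above avoids by replacing `seq_lens_cpu += 1` with an
# out-of-place add: an in-place op mutates every holder of the tensor, while
# an out-of-place add rebinds the local name and leaves the tensor that the
# main model's next prepare_inputs still references untouched.
import torch

shared = torch.tensor([7, 7])  # stand-in for a CPU tensor shared across steps
alias = shared

alias += 1                     # in-place: the shared tensor changes as well
assert shared.tolist() == [8, 8]

alias = alias + 1              # out-of-place: new tensor, `shared` unchanged
assert shared.tolist() == [8, 8]
assert alias.tolist() == [9, 9]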
del self._logprobs_tensors del self._sampled_token_ids - - valid_sampled_token_ids: list[np.ndarray] = [ - row for row in self.sampled_token_ids_cpu.numpy() - ] + max_gen_len = self.sampled_token_ids_cpu.shape[-1] + if max_gen_len == 1: + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in self.sampled_token_ids_cpu.numpy() + ] + else: + valid_sampled_token_ids = RejectionSampler.parse_output( + self.sampled_token_ids_cpu, + self.vocab_size, + ) for i in self._invalid_req_indices: valid_sampled_token_ids[i] = np.array([]) @@ -377,6 +385,10 @@ def __init__( ) self.rejection_sampler = RejectionSampler(self.sampler) + self.num_spec_tokens = 0 + if self.speculative_config: + self.num_spec_tokens = self.speculative_config.num_speculative_tokens + # Request states. self.requests: dict[str, CachedRequestState] = {} self.comm_stream = torch.cuda.Stream() @@ -513,11 +525,7 @@ def __init__( self.max_num_tokens, dtype=torch.int32, device=self.device ) - self.uniform_decode_query_len = ( - 1 - if not self.speculative_config - else 1 + self.speculative_config.num_speculative_tokens - ) + self.uniform_decode_query_len = 1 + self.num_spec_tokens # Cudagraph dispatcher for runtime cudagraph dispatching. self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) @@ -549,6 +557,20 @@ def __init__( pin_memory=self.pin_memory, ) + # Pre-allocated tensor for copying valid sampled token counts to CPU, + # with dedicated stream for overlapping and event for coordination. + self.valid_sampled_token_count_event: torch.cuda.Event | None = None + self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None + if self.use_async_scheduling and self.num_spec_tokens: + self.valid_sampled_token_count_event = torch.cuda.Event() + self.valid_sampled_token_count_copy_stream = torch.cuda.Stream() + self.valid_sampled_token_count_cpu = torch.empty( + self.max_num_reqs, + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory, + ) + # Ephemeral state transferred between execute_model() and sample_tokens(). self.execute_model_state: ExecuteModelState | None = None @@ -736,17 +758,45 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the states of the running/resumed requests. is_last_rank = get_pp_group().is_last_rank req_data = scheduler_output.scheduled_cached_reqs + + # Wait until valid_sampled_tokens_count is copied to cpu, + # then use it to update actual num_computed_tokens of each request. + valid_sampled_token_count = self._get_valid_sampled_token_count() + for i, req_id in enumerate(req_data.req_ids): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] resumed_from_preemption = req_id in req_data.resumed_req_ids num_output_tokens = req_data.num_output_tokens[i] + req_index = self.input_batch.req_id_to_index.get(req_id) - # Update the cached states. + # prev_num_draft_len is used in async scheduling mode with + # spec decode. it indicates if need to update num_computed_tokens + # of the request. for example: + # fist step: num_computed_tokens = 0, spec_tokens = [], + # prev_num_draft_len = 0. + # second step: num_computed_tokens = 100(prompt lenth), + # spec_tokens = [a,b], prev_num_draft_len = 0. + # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], + # prev_num_draft_len = 2. + # num_computed_tokens in first step and second step does't contain + # the spec tokens length, but in third step it contains the + # spec tokens length. 
we only need to update num_computed_tokens + # when prev_num_draft_len > 0. + if req_state.prev_num_draft_len: + if req_index is None: + req_state.prev_num_draft_len = 0 + else: + assert self.input_batch.prev_req_id_to_index is not None + prev_req_index = self.input_batch.prev_req_id_to_index[req_id] + num_accepted = valid_sampled_token_count[prev_req_index] - 1 + num_rejected = req_state.prev_num_draft_len - num_accepted + num_computed_tokens -= num_rejected + req_state.output_token_ids.extend([-1] * num_accepted) + # Update the cached states. req_state.num_computed_tokens = num_computed_tokens - req_index = self.input_batch.req_id_to_index.get(req_id) if not is_last_rank: # When using PP, the scheduler sends the sampled tokens back, @@ -823,8 +873,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( req_id, [] ) - if spec_token_ids: - num_spec_tokens = len(spec_token_ids) + num_spec_tokens = len(spec_token_ids) + # For async scheduling, token_ids_cpu assigned from + # spec_token_ids are placeholders and will be overwritten in + # _prepare_input_ids. + if num_spec_tokens: start_index = self.input_batch.num_tokens_no_spec[req_index] end_token_index = start_index + num_spec_tokens self.input_batch.token_ids_cpu[ @@ -840,6 +893,15 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # even when speculative decoding is enabled. self.input_batch.spec_token_ids[req_index] = spec_token_ids + # there are no draft tokens with async scheduling, + # we clear the spec_decoding info in scheduler_output and + # use normal sampling but rejection_sampling. + if self.use_async_scheduling: + req_state.prev_num_draft_len = num_spec_tokens + if num_spec_tokens and self._draft_token_ids is None: + scheduler_output.total_num_scheduled_tokens -= num_spec_tokens + scheduler_output.num_scheduled_tokens[req_id] -= num_spec_tokens + scheduler_output.scheduled_spec_decode_tokens.pop(req_id, None) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. for request in reqs_to_add: @@ -959,7 +1021,10 @@ def _get_cumsum_and_arange( return cu_num_tokens, arange def _prepare_input_ids( - self, total_num_scheduled_tokens: int, cu_num_tokens: np.ndarray + self, + scheduler_output: "SchedulerOutput", + total_num_scheduled_tokens: int, + cu_num_tokens: np.ndarray, ) -> None: """Prepare the input IDs for the current batch. @@ -980,21 +1045,43 @@ def _prepare_input_ids( # on the GPU from prev_sampled_token_ids. prev_req_id_to_index = self.input_batch.prev_req_id_to_index assert prev_req_id_to_index is not None - flattened_indices = [] - prev_common_req_indices = [] + sample_flattened_indices: list[int] = [] + spec_flattened_indices: list[int] = [] + prev_common_req_indices: list[int] = [] + prev_draft_token_indices: list[int] = [] indices_match = True max_flattened_index = -1 + total_num_spec_tokens = 0 + scheduled_spec_tokens = scheduler_output.scheduled_spec_decode_tokens + for req_id, cur_index in self.input_batch.req_id_to_index.items(): if (prev_index := prev_req_id_to_index.get(req_id)) is not None: prev_common_req_indices.append(prev_index) # We need to compute the flattened input_ids index of the # last token in each common request. 
+ draft_len = len(scheduled_spec_tokens.get(req_id, ())) + total_num_spec_tokens += draft_len flattened_index = cu_num_tokens[cur_index].item() - 1 - flattened_indices.append(flattened_index) + # example: cu_num_tokens = [2, 5, 8], draft_tokens = [1, 2, 2] + # sample_flattened_indices = [0, 2, 5] + # spec_flattened_indices = [1, 3, 4, 6, 7] + sample_flattened_indices.append(flattened_index - draft_len) + spec_flattened_indices.extend( + range(flattened_index - draft_len + 1, flattened_index + 1) + ) + start = prev_index * self.num_spec_tokens + # prev_draft_token_indices is used to find which draft_tokens_id + # should be copied to input_ids + # example: prev draft_tokens_id [[1,2], [3,4], [5, 6]] + # flatten draft_tokens_id [1,2,3,4,5,6] + # draft_len of each request [1, 2, 1] + # then prev_draft_token_indices is [0, 2, 3, 4] + prev_draft_token_indices.extend(range(start, start + draft_len)) indices_match &= prev_index == flattened_index max_flattened_index = max(max_flattened_index, flattened_index) - num_commmon_tokens = len(flattened_indices) - if num_commmon_tokens < total_num_scheduled_tokens: + num_commmon_tokens = len(sample_flattened_indices) + total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens + if num_commmon_tokens < total_without_spec: # If not all requests are decodes from the last iteration, # We need to copy the input_ids_cpu to the GPU first. self.input_ids.copy_to_gpu(total_num_scheduled_tokens) @@ -1018,20 +1105,43 @@ def _prepare_input_ids( self.is_token_ids.gpu[:num_commmon_tokens] = True return # Upload the index tensors asynchronously so the scatter can be non-blocking. - input_ids_index_tensor = torch.tensor( - flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory + sampled_tokens_index_tensor = torch.tensor( + sample_flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory ).to(self.device, non_blocking=True) prev_common_req_indices_tensor = torch.tensor( prev_common_req_indices, dtype=torch.int64, pin_memory=self.pin_memory ).to(self.device, non_blocking=True) self.input_ids.gpu.scatter_( dim=0, - index=input_ids_index_tensor, + index=sampled_tokens_index_tensor, src=self.input_batch.prev_sampled_token_ids[ prev_common_req_indices_tensor, 0 ], ) + # Scatter the draft tokens after the sampled tokens are scattered. + if self._draft_token_ids is None or not spec_flattened_indices: + return + + assert isinstance(self._draft_token_ids, torch.Tensor) + draft_tokens_index_tensor = torch.tensor( + spec_flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory + ).to(self.device, non_blocking=True) + prev_draft_token_indices_tensor = torch.tensor( + prev_draft_token_indices, dtype=torch.int64, pin_memory=self.pin_memory + ).to(self.device, non_blocking=True) + + # because input_ids dtype is torch.int32, + # so convert draft_token_ids to torch.int32 here. + draft_token_ids = self._draft_token_ids.to(dtype=torch.int32) + self._draft_token_ids = None + + self.input_ids.gpu.scatter_( + dim=0, + index=draft_tokens_index_tensor, + src=draft_token_ids.flatten()[prev_draft_token_indices_tensor], + ) + def _get_encoder_seq_lens( self, scheduled_encoder_inputs: dict[str, list[int]], @@ -1218,7 +1328,11 @@ def _prepare_inputs( self.discard_request_indices.copy_to_gpu(self.num_discarded_requests) # Copy the tensors to the GPU. 
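# Standalone sketch (not part of the patch) reproducing the worked example
# documented in the _prepare_input_ids comments above: where the sampled
# (bonus) token and the draft tokens of each request land in the flattened
# input_ids buffer, and which entries of the flattened previous-step draft
# buffer should be copied from.
cu_num_tokens = [2, 5, 8]  # cumulative scheduled tokens per request
draft_lens = [1, 2, 2]     # scheduled draft tokens per request

sample_flattened_indices = []
spec_flattened_indices = []
for cu, draft_len in zip(cu_num_tokens, draft_lens):
    last = cu - 1  # flattened index of the request's last token
    sample_flattened_indices.append(last - draft_len)
    spec_flattened_indices.extend(range(last - draft_len + 1, last + 1))

assert sample_flattened_indices == [0, 2, 5]
assert spec_flattened_indices == [1, 3, 4, 6, 7]

# Previous-step draft ids [[1, 2], [3, 4], [5, 6]] flatten to
# [1, 2, 3, 4, 5, 6]; with num_spec_tokens == 2 and per-request draft lens
# [1, 2, 1], the entries to copy sit at positions [0, 2, 3, 4].
num_spec_tokens = 2
prev_draft_token_indices = []
for prev_index, draft_len in enumerate([1, 2, 1]):
    start = prev_index * num_spec_tokens
    prev_draft_token_indices.extend(range(start, start + draft_len))
assert prev_draft_token_indices == [0, 2, 3, 4]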
- self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens) + self._prepare_input_ids( + scheduler_output, + total_num_scheduled_tokens, + cu_num_tokens, + ) if self.uses_mrope: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) @@ -2377,12 +2491,14 @@ def _bookkeeping_sync( valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() invalid_req_indices_set = set(invalid_req_indices) - assert sampled_token_ids.shape[-1] == 1 # Cache the sampled tokens on the GPU and avoid CPU sync. # These will be copied into input_ids in the next step # when preparing inputs. - self.input_batch.prev_sampled_token_ids = sampled_token_ids + # With spec decoding, this is done in propose_draft_token_ids(). + if self.input_batch.prev_sampled_token_ids is None: + assert sampled_token_ids.shape[-1] == 1 + self.input_batch.prev_sampled_token_ids = sampled_token_ids self.input_batch.prev_req_id_to_index = { req_id: i for i, req_id in enumerate(self.input_batch.req_ids) @@ -2517,6 +2633,21 @@ def execute_model( "State error: sample_tokens() must be called " "after execute_model() returns None." ) + + # self._draft_token_ids is None when `input_fits_in_drafter=False` + # and there is no draft tokens scheduled. so it need to update the + # spec_decoding info in scheduler_output with async_scheduling. + # use deepcopy to avoid the modification has influence on the + # scheduler_output in engine core process. + # TODO(Ronald1995): deepcopy is expensive when there is a large + # number of requests, optimize it later. + if ( + self.use_async_scheduling + and self.num_spec_tokens + and self._draft_token_ids is None + ): + scheduler_output = deepcopy(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with record_function_or_nullcontext("gpu_model_runner: preprocess"): with self.synchronize_input_prep(): @@ -2759,6 +2890,8 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) + self.input_batch.prev_sampled_token_ids = None + def propose_draft_token_ids( sampled_token_ids: torch.Tensor | list[np.ndarray], ) -> None: @@ -2792,14 +2925,29 @@ def propose_draft_token_ids( self.speculative_config.draft_model_config.max_model_len ) input_fits_in_drafter = spec_decode_common_attn_metadata and ( - spec_decode_common_attn_metadata.max_seq_len - + self.speculative_config.num_speculative_tokens + spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len ) - if use_padded_batch_for_eagle and input_fits_in_drafter: - # EAGLE speculative decoding can use the GPU sampled tokens - # as inputs, and does not need to wait for bookkeeping to finish. - propose_draft_token_ids(sampler_output.sampled_token_ids) + if use_padded_batch_for_eagle: + sampled_token_ids = sampler_output.sampled_token_ids + if input_fits_in_drafter: + # EAGLE speculative decoding can use the GPU sampled tokens + # as inputs, and does not need to wait for bookkeeping to finish. 
+ propose_draft_token_ids(sampled_token_ids) + elif self.valid_sampled_token_count_event is not None: + next_token_ids, valid_sampled_tokens_count = ( + self.drafter.prepare_next_token_ids_padded( + spec_decode_common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_indices.gpu, + self.num_discarded_requests, + ) + ) + self._copy_valid_sampled_token_count( + next_token_ids, valid_sampled_tokens_count + ) with record_function_or_nullcontext("gpu_model_runner: bookkeep"): ( @@ -2856,6 +3004,7 @@ def propose_draft_token_ids( logprobs_tensors=sampler_output.logprobs_tensors, invalid_req_indices=invalid_req_indices, async_output_copy_stream=self.async_output_copy_stream, + vocab_size=self.input_batch.vocab_size, ) with record_function_or_nullcontext( "gpu_model_runner: set_async_sampled_token_ids" @@ -2880,6 +3029,37 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: self._draft_token_ids = None return DraftTokenIds(req_ids, draft_token_ids) + def _copy_valid_sampled_token_count( + self, next_token_ids: torch.Tensor, valid_sampled_tokens_count: torch.Tensor + ) -> None: + if self.valid_sampled_token_count_event is None: + return + + default_stream = torch.cuda.current_stream() + # Initialize a new stream to overlap the copy operation with + # prepare_input of draft model. + with torch.cuda.stream(self.valid_sampled_token_count_copy_stream): + self.valid_sampled_token_count_copy_stream.wait_stream(default_stream) # type: ignore + counts = valid_sampled_tokens_count + counts_cpu = self.valid_sampled_token_count_cpu + counts_cpu[: counts.shape[0]].copy_(counts, non_blocking=True) + self.valid_sampled_token_count_event.record() + + self.input_batch.prev_sampled_token_ids = next_token_ids.unsqueeze(1) + + def _get_valid_sampled_token_count(self) -> list[int]: + # Wait until valid_sampled_tokens_count is copied to cpu, + prev_sampled_token_ids = self.input_batch.prev_sampled_token_ids + if ( + self.valid_sampled_token_count_event is None + or prev_sampled_token_ids is None + ): + return [] + + counts_cpu = self.valid_sampled_token_count_cpu + self.valid_sampled_token_count_event.synchronize() + return counts_cpu[: prev_sampled_token_ids.shape[0]].tolist() + def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", @@ -2967,6 +3147,9 @@ def propose_draft_token_ids( self.num_discarded_requests, ) ) + self._copy_valid_sampled_token_count( + next_token_ids, valid_sampled_tokens_count + ) if spec_decode_metadata is None: token_indices_to_sample = None @@ -3532,7 +3715,7 @@ def _dummy_run( # TODO(luka) better system for describing dummy batches seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] else: - seq_lens = max_query_len + seq_lens = max_query_len # type: ignore[assignment] self.seq_lens.np[:num_reqs] = seq_lens self.seq_lens.np[num_reqs:] = 0 self.seq_lens.copy_to_gpu() @@ -4485,11 +4668,7 @@ def may_reinitialize_input_batch( logitsprocs=self.input_batch.logitsprocs, logitsprocs_need_output_token_ids=self.input_batch.logitsprocs_need_output_token_ids, is_pooling_model=self.is_pooling_model, - num_speculative_tokens=( - self.vllm_config.speculative_config.num_speculative_tokens - if self.vllm_config.speculative_config - else 0 - ), + num_speculative_tokens=self.num_spec_tokens, ) def _allocate_kv_cache_tensors( From 7765e5ba75c0b5caa8f372bfa20ab3de2c6b3aac Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 17 Nov 2025 14:08:50 -0800 Subject: [PATCH 134/578] [BugFix] Fix PP performance and PP kv connector output 
regression (#28768) Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 150 +++++++++++++---------------- vllm/v1/executor/ray_executor.py | 21 +++- vllm/v1/worker/gpu_model_runner.py | 23 ++++- vllm/v1/worker/gpu_worker.py | 15 +-- 4 files changed, 105 insertions(+), 104 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 508669cf527d..97286c6e2e5e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -63,7 +63,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager -from vllm.v1.utils import record_function_or_nullcontext from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -181,11 +180,13 @@ def __init__( logger.info("Batch queue is enabled with size %d", self.batch_queue_size) self.batch_queue = deque(maxlen=self.batch_queue_size) + self.ec_producer = ( + vllm_config.ec_transfer_config is not None + and vllm_config.ec_transfer_config.is_ec_producer + ) + self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None - if ( - self.vllm_config.cache_config.enable_prefix_caching - or kv_connector is not None - ): + if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None: caching_hash_fn = get_hash_fn_by_name( vllm_config.cache_config.prefix_caching_hash_algo ) @@ -246,7 +247,7 @@ def _initialize_kv_caches( elapsed = time.time() - start logger.info_once( - ("init engine (profile, create kv cache, warmup model) took %.2f seconds"), + "init engine (profile, create kv cache, warmup model) took %.2f seconds", elapsed, scope="local", ) @@ -312,6 +313,16 @@ def log_error_detail(self, scheduler_output: SchedulerOutput): ) raise err + def _log_err_callback(self, scheduler_output: SchedulerOutput): + """Log error details of a future that's not expected to return a result.""" + + def callback(f, sched_output=scheduler_output): + with self.log_error_detail(sched_output): + result = f.result() + assert result is None + + return callback + def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: """Schedule, execute, and make output. @@ -323,21 +334,17 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: # or finished and not yet removed from the batch. 
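# Generic sketch (not part of the patch) of the error-surfacing pattern behind
# _log_err_callback above: when a future's result is never consumed directly,
# a done-callback still calls .result() so any exception raised by the worker
# is surfaced instead of being silently dropped. The names below are
# illustrative only, not vLLM APIs.
from concurrent.futures import ThreadPoolExecutor


def make_err_callback(tag):
    def callback(fut):
        try:
            result = fut.result()  # re-raises if the submitted work failed
            assert result is None  # this future is not expected to carry data
        except Exception as exc:   # stand-in for logging scheduler context
            print(f"step {tag!r} failed: {exc!r}")

    return callback


with ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(lambda: None)  # stand-in for execute_model(..., non_block=True)
    fut.add_done_callback(make_err_callback("execute_model"))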
if not self.scheduler.has_requests(): return {}, False - with record_function_or_nullcontext("core step: schedule"): - scheduler_output = self.scheduler.schedule() - - with record_function_or_nullcontext("core step: execute_model"): - future = self.model_executor.execute_model(scheduler_output, non_block=True) - grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output) - with self.log_error_detail(scheduler_output): - model_output = future.result() - if model_output is None: - model_output = self.model_executor.sample_tokens(grammar_output) - - with record_function_or_nullcontext("core step: update_from_output"): - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output - ) + scheduler_output = self.scheduler.schedule() + future = self.model_executor.execute_model(scheduler_output, non_block=True) + grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output) + with self.log_error_detail(scheduler_output): + model_output = future.result() + if model_output is None: + model_output = self.model_executor.sample_tokens(grammar_output) + + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, model_output + ) return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0 @@ -378,52 +385,34 @@ def step_with_batch_queue( model_executed = False deferred_scheduler_output = None if self.scheduler.has_requests(): - with record_function_or_nullcontext("core step_with_batch_queue: schedule"): - scheduler_output = self.scheduler.schedule() - with record_function_or_nullcontext( - "core step_with_batch_queue: execute_model" - ): - exec_future = self.model_executor.execute_model( - scheduler_output, non_block=True - ) - model_executed = scheduler_output.total_num_scheduled_tokens > 0 + scheduler_output = self.scheduler.schedule() + exec_future = self.model_executor.execute_model( + scheduler_output, non_block=True + ) + if not self.ec_producer: + model_executed = scheduler_output.total_num_scheduled_tokens > 0 - if scheduler_output.pending_structured_output_tokens: - with record_function_or_nullcontext( - "core step_with_batch_queue: pending_structured_output_tokens" - ): - # We need to defer sampling until we have processed the model output - # from the prior step. - deferred_scheduler_output = scheduler_output - # Block-wait for execute to return - # (continues running async on the GPU). - with self.log_error_detail(scheduler_output): - exec_result = exec_future.result() - assert exec_result is None + if not model_executed: + # No sampling required (no requests scheduled). + future = cast(Future[ModelRunnerOutput], exec_future) else: - with record_function_or_nullcontext( - "core step_with_batch_queue: get_grammar_bitmask" - ): - # We aren't waiting for any tokens, get any grammar - # output immediately. + exec_future.add_done_callback(self._log_err_callback(scheduler_output)) + + if not scheduler_output.pending_structured_output_tokens: + # We aren't waiting for any tokens, get any grammar output + # and sample immediately. grammar_output = self.scheduler.get_grammar_bitmask( scheduler_output ) - # Block-wait for execute to return (continues running async on the GPU). - with self.log_error_detail(scheduler_output): - exec_result = exec_future.result() - - if exec_result is None: - with record_function_or_nullcontext( - "core step_with_batch_queue: sample_tokens" - ): - # Call sample tokens. 
- future = self.model_executor.sample_tokens( - grammar_output, non_block=True - ) + future = self.model_executor.sample_tokens( + grammar_output, non_block=True + ) else: - # No sampling required (e.g. all requests finished). - future = cast(Future[ModelRunnerOutput], exec_future) + # We need to defer sampling until we have processed the model output + # from the prior step. + deferred_scheduler_output = scheduler_output + + if not deferred_scheduler_output: # Add this step's future to the queue. batch_queue.appendleft((future, scheduler_output)) if ( @@ -440,34 +429,27 @@ def step_with_batch_queue( # only be called when the scheduler contains requests or the queue # is non-empty. return None, False - with record_function_or_nullcontext("core step_with_batch_queue: model_output"): - # Block until the next result is available. - future, scheduler_output = batch_queue.pop() - with self.log_error_detail(scheduler_output): - model_output = future.result() - with record_function_or_nullcontext( - "core step_with_batch_queue: update_from_output" - ): - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output - ) + + # Block until the next result is available. + future, scheduler_output = batch_queue.pop() + with self.log_error_detail(scheduler_output): + model_output = future.result() + + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, model_output + ) # NOTE(nick): We can either handle the deferred tasks here or save # in a field and do it immediately once step_with_batch_queue is # re-called. The latter slightly favors TTFT over TPOT/throughput. if deferred_scheduler_output: - with record_function_or_nullcontext( - "core step_with_batch_queue: deferred_scheduler_output" - ): - # We now have the tokens needed to compute the bitmask for the - # deferred request. Get the bitmask and call sample tokens. - grammar_output = self.scheduler.get_grammar_bitmask( - deferred_scheduler_output - ) - future = self.model_executor.sample_tokens( - grammar_output, non_block=True - ) - batch_queue.appendleft((future, deferred_scheduler_output)) + # We now have the tokens needed to compute the bitmask for the + # deferred request. Get the bitmask and call sample tokens. + grammar_output = self.scheduler.get_grammar_bitmask( + deferred_scheduler_output + ) + future = self.model_executor.sample_tokens(grammar_output, non_block=True) + batch_queue.appendleft((future, deferred_scheduler_output)) return engine_core_outputs, model_executed diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 119e4c081831..55db7445c9c7 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -99,6 +99,11 @@ def _init_executor(self) -> None: # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None + self.ec_producer = ( + self.vllm_config.ec_transfer_config is not None + and self.vllm_config.ec_transfer_config.is_ec_producer + ) + self.scheduler_output: SchedulerOutput | None = None @property @@ -395,6 +400,12 @@ def execute_model( # type: ignore[override] "State error: sample_tokens() must be called " "after execute_model() returns None." ) + + if self.ec_producer or not scheduler_output.total_num_scheduled_tokens: + # Model will not execute, call model runner immediately. + return self._execute_dag(scheduler_output, None, non_block) + + # Model will execute, defer to sample_tokens() call. 
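# Illustrative sketch (not part of the patch): when execute_model() defers the
# real work to a later sample_tokens() call, the executor still has to hand a
# Future back to the caller, so it returns one that is already resolved to
# None. COMPLETED_NONE_FUTURE used in this file is assumed to be built in
# essentially this way.
from concurrent.futures import Future


def make_completed_none_future():
    fut = Future()
    fut.set_result(None)  # already done; .result() returns None immediately
    return fut


deferred = make_completed_none_future()
assert deferred.done() and deferred.result() is None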
self.scheduler_output = scheduler_output return COMPLETED_NONE_FUTURE if non_block else None @@ -417,10 +428,18 @@ def sample_tokens( # type: ignore[override] """ scheduler_output = self.scheduler_output if scheduler_output is None: - return None # noqa + return COMPLETED_NONE_FUTURE if non_block else None # noqa self.scheduler_output = None + return self._execute_dag(scheduler_output, grammar_output, non_block) + + def _execute_dag( + self, + scheduler_output: SchedulerOutput, + grammar_output: "GrammarOutput | None", + non_block: bool = False, + ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: # Build the compiled DAG for the first time. if self.forward_dag is None: # type: ignore self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 758e3e1b3a82..2a8ff746f112 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -7,7 +7,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager -from copy import deepcopy +from copy import copy, deepcopy from functools import reduce from itertools import product from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast @@ -250,7 +250,6 @@ class ExecuteModelState(NamedTuple): hidden_states: torch.Tensor sample_hidden_states: torch.Tensor aux_hidden_states: list[torch.Tensor] | None - kv_connector_output: KVConnectorOutput | None ec_connector_output: ECConnectorOutput | None @@ -573,6 +572,7 @@ def __init__( # Ephemeral state transferred between execute_model() and sample_tokens(). self.execute_model_state: ExecuteModelState | None = None + self.kv_connector_output: KVConnectorOutput | None = None def reset_mm_cache(self) -> None: if self.mm_budget: @@ -2803,6 +2803,7 @@ def execute_model( # Return the intermediate tensors. assert isinstance(hidden_states, IntermediateTensors) hidden_states.kv_connector_output = kv_connector_output + self.kv_connector_output = kv_connector_output return hidden_states if self.is_pooling_model: @@ -2853,18 +2854,31 @@ def execute_model( hidden_states, sample_hidden_states, aux_hidden_states, - kv_connector_output, ec_connector_output, ) + self.kv_connector_output = kv_connector_output return None @torch.inference_mode def sample_tokens( self, grammar_output: "GrammarOutput | None" ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: + kv_connector_output = self.kv_connector_output + self.kv_connector_output = None + if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used. - return None # noqa + if not kv_connector_output: + return None # noqa + + # In case of PP with kv transfer, we need to pass through the + # kv_connector_output + if kv_connector_output.is_empty(): + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.kv_connector_output = kv_connector_output + return output # Unpack ephemeral state. ( @@ -2875,7 +2889,6 @@ def sample_tokens( hidden_states, sample_hidden_states, aux_hidden_states, - kv_connector_output, ec_connector_output, ) = self.execute_model_state # Clear ephemeral state. 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 42a844d96558..315f01b68499 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU worker class.""" -import copy import gc import os from contextlib import AbstractContextManager, nullcontext @@ -45,7 +44,6 @@ from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ( - EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput, @@ -581,18 +579,7 @@ def execute_model( all_gather_tensors=all_gather_tensors, ) - kv_connector_output = output.kv_connector_output - if not kv_connector_output: - return None - - # In case of PP with kv transfer, we need to pass through the - # kv_connector_output - if kv_connector_output.is_empty(): - return EMPTY_MODEL_RUNNER_OUTPUT - - output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - output.kv_connector_output = kv_connector_output - return output + return None def take_draft_token_ids(self) -> DraftTokenIds | None: return self.model_runner.take_draft_token_ids() From 95ae50b7d1bf3b5b66ac39b19d3169bad5443f2e Mon Sep 17 00:00:00 2001 From: Shreyas Kulkarni Date: Mon, 17 Nov 2025 18:01:34 -0500 Subject: [PATCH 135/578] [Quantization] [Eagle] Add complete quantization support to the draft model in Eagle (#28435) Signed-off-by: Shreyas Kulkarni --- .../model_executor/test_eagle_quantization.py | 169 ++++++++++++++++++ vllm/model_executor/models/llama_eagle.py | 53 ++++-- vllm/model_executor/models/llama_eagle3.py | 62 +++++-- vllm/model_executor/models/utils.py | 27 +++ 4 files changed, 282 insertions(+), 29 deletions(-) create mode 100644 tests/model_executor/test_eagle_quantization.py diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py new file mode 100644 index 000000000000..1ab75933ee31 --- /dev/null +++ b/tests/model_executor/test_eagle_quantization.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import Mock, patch + +import pytest +import torch + +from vllm.config import LoadConfig, ModelConfig, SpeculativeConfig, VllmConfig +from vllm.model_executor.models.utils import get_draft_quant_config +from vllm.platforms import current_platform + +DEVICES = ( + [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + if current_platform.is_cuda_alike() + else ["cpu"] +) + + +def test_get_draft_quant_config_with_draft_model(): + mock_draft_model_config = Mock(spec=ModelConfig) + mock_load_config = Mock(spec=LoadConfig) + mock_speculative_config = Mock(spec=SpeculativeConfig) + mock_speculative_config.draft_model_config = mock_draft_model_config + + mock_vllm_config = Mock(spec=VllmConfig) + mock_vllm_config.speculative_config = mock_speculative_config + mock_vllm_config.load_config = mock_load_config + + mock_quant_config = Mock() + with patch.object( + VllmConfig, "get_quantization_config", return_value=mock_quant_config + ): + result = get_draft_quant_config(mock_vllm_config) + + # Verify the function calls get_quantization_config with draft model config + VllmConfig.get_quantization_config.assert_called_once_with( + mock_draft_model_config, mock_load_config + ) + assert result == mock_quant_config + + +def test_get_draft_quant_config_without_draft_model(): + 
mock_speculative_config = Mock(spec=SpeculativeConfig) + mock_speculative_config.draft_model_config = None + + mock_vllm_config = Mock(spec=VllmConfig) + mock_vllm_config.speculative_config = mock_speculative_config + mock_vllm_config.load_config = Mock(spec=LoadConfig) + + result = get_draft_quant_config(mock_vllm_config) + + assert result is None + + +@torch.inference_mode() +@pytest.mark.parametrize("device", DEVICES) +def test_fc_layer_quant_config_usage(dist_init, device) -> None: + import torch + + from vllm.model_executor.layers.linear import ReplicatedLinear + + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + + torch.set_default_device(device) + + input_size = 256 + output_size = 128 + + fc_no_quant = ReplicatedLinear( + input_size=input_size, + output_size=output_size, + bias=False, + params_dtype=torch.float16, + quant_config=None, + prefix="fc", + ) + + assert fc_no_quant.quant_config is None + assert fc_no_quant.input_size == input_size + assert fc_no_quant.output_size == output_size + + mock_quant_config = Mock() + fc_with_quant = ReplicatedLinear( + input_size=input_size, + output_size=output_size, + bias=False, + params_dtype=torch.float16, + quant_config=mock_quant_config, + prefix="fc", + ) + + assert fc_with_quant.quant_config == mock_quant_config + + # Check forward pass + x = torch.randn(2, input_size, dtype=torch.float16) + output, _ = fc_no_quant(x) + assert output.shape == (2, output_size) + + +def test_kv_cache_scale_name_handling(): + # Mock a quant config that supports cache scales + mock_quant_config = Mock() + mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale") + + # Condition check in load_weights + name = "layers.0.self_attn.k_proj.weight" + scale_name = mock_quant_config.get_cache_scale(name) + + # Check if get_cache_scale is called and returns expected value + mock_quant_config.get_cache_scale.assert_called_once_with(name) + assert scale_name == "layers.0.self_attn.kv_scale" + + +def test_kv_cache_scale_name_no_scale(): + # Mock a quant config that returns None for get_cache_scale + mock_quant_config = Mock() + mock_quant_config.get_cache_scale = Mock(return_value=None) + + name = "layers.0.mlp.gate_proj.weight" + scale_name = mock_quant_config.get_cache_scale(name) + + # Should return None for weights that don't have cache scales + assert scale_name is None + + +def test_maybe_remap_kv_scale_name(): + from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name + + params_dict = { + "layers.0.self_attn.kv_scale": Mock(), + "layers.1.self_attn.kv_scale": Mock(), + } + + name = "layers.0.self_attn.some_scale" + remapped = maybe_remap_kv_scale_name(name, params_dict) + + assert remapped in params_dict or remapped == name or remapped is None + + +def test_load_weights_kv_scale_handling(): + kv_scale_param = Mock() + kv_scale_param.weight_loader = Mock() + + params_dict = { + "layers.0.self_attn.kv_scale": kv_scale_param, + } + + mock_quant_config = Mock() + mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale") + + # Load_weights logic for KV cache scales + name = "layers.0.self_attn.k_proj.weight" + loaded_weight_tensor = torch.tensor([1.0, 2.0]) + + if mock_quant_config is not None: + scale_name = mock_quant_config.get_cache_scale(name) + if scale_name: + param = params_dict[scale_name] + assert param is kv_scale_param + weight_to_load = ( + loaded_weight_tensor + if loaded_weight_tensor.dim() == 0 + else loaded_weight_tensor[0] + ) + + assert 
scale_name == "layers.0.self_attn.kv_scale" + assert weight_to_load == loaded_weight_tensor[0] diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 0287132c5637..90ab5c50361b 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -11,13 +11,22 @@ from vllm.config import VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight +from .utils import ( + AutoWeightsLoader, + get_draft_quant_config, + maybe_prefix, + process_eagle_weight, +) logger = init_logger(__name__) @@ -40,14 +49,7 @@ def __init__( def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None: """Use drafter's quantization config instead of verifier's.""" - draft_model_config = vllm_config.speculative_config.draft_model_config - draft_load_config = vllm_config.load_config - - return ( - VllmConfig.get_quantization_config(draft_model_config, draft_load_config) - if draft_model_config - else None - ) + return get_draft_quant_config(vllm_config) @support_torch_compile @@ -63,6 +65,9 @@ def __init__( self.config = vllm_config.speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size + # Get drafter's quantization config + self.quant_config = get_draft_quant_config(vllm_config) + self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, self.config.hidden_size, @@ -80,8 +85,14 @@ def __init__( for i in range(self.config.num_hidden_layers) ] ) - self.fc = torch.nn.Linear( - self.config.hidden_size * 2, self.config.hidden_size, bias=False + self.fc = ReplicatedLinear( + input_size=self.config.hidden_size * 2, + output_size=self.config.hidden_size, + bias=False, + params_dtype=vllm_config.model_config.dtype, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "fc"), + return_bias=False, ) def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: @@ -117,6 +128,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: + # Handle kv cache quantization scales + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + # Remapping the name FP8 kv-scale + if "scale" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff 
--git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index a3bcc5eeb32b..75c671311b49 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -11,19 +11,27 @@ from vllm.config import VllmConfig, get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import QKVParallelLinear +from vllm.model_executor.layers.linear import QKVParallelLinear, ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight +from .utils import ( + AutoWeightsLoader, + get_draft_quant_config, + maybe_prefix, + process_eagle_weight, +) logger = init_logger(__name__) @@ -66,14 +74,7 @@ def __init__( def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None: """Use drafter's quantization config instead of verifier's.""" - draft_model_config = vllm_config.speculative_config.draft_model_config - draft_load_config = vllm_config.load_config - - return ( - VllmConfig.get_quantization_config(draft_model_config, draft_load_config) - if draft_model_config - else None - ) + return get_draft_quant_config(vllm_config) def _norm_before_residual( self, hidden_states: torch.Tensor @@ -140,6 +141,9 @@ def __init__( self.config = vllm_config.speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size + # Get drafter's quantization config + self.quant_config = get_draft_quant_config(vllm_config) + current_vllm_config = get_current_vllm_config() self.embed_tokens = VocabParallelEmbedding( @@ -160,13 +164,19 @@ def __init__( ] ) if hasattr(self.config, "target_hidden_size"): - self.fc = torch.nn.Linear( - self.config.target_hidden_size * 3, self.config.hidden_size, bias=False - ) + fc_input_size = self.config.target_hidden_size * 3 else: - self.fc = torch.nn.Linear( - self.config.hidden_size * 3, self.config.hidden_size, bias=False - ) + fc_input_size = self.config.hidden_size * 3 + self.fc = ReplicatedLinear( + input_size=fc_input_size, + output_size=self.config.hidden_size, + bias=False, + params_dtype=vllm_config.model_config.dtype, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "fc"), + return_bias=False, + ) + self.norm = RMSNorm( self.config.hidden_size, eps=self.config.rms_norm_eps, @@ -211,6 +221,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for name, loaded_weight in weights: if "midlayer." 
in name: name = name.replace("midlayer.", "layers.0.") + # Handle kv cache quantization scales + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + # Remapping the name FP8 kv-scale + if "scale" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0d811fbc7585..ca5af358e2ee 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -18,6 +18,9 @@ get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, +) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors @@ -715,6 +718,30 @@ def maybe_prefix(prefix: str, name: str) -> str: return name if not prefix else f"{prefix}.{name}" +def get_draft_quant_config( + vllm_config: VllmConfig, +) -> QuantizationConfig | None: + """Get quantization config for Draft models. + + Draft models should use their own quantization config instead of the verifier/target + model's config. This helper retrieves the draft model's quantization config. + + Args: + vllm_config: The vLLM configuration object. + + Returns: + The draft model's config if available, None otherwise. + """ + draft_model_config = vllm_config.speculative_config.draft_model_config + draft_load_config = vllm_config.load_config + + return ( + VllmConfig.get_quantization_config(draft_model_config, draft_load_config) + if draft_model_config + else None + ) + + def extract_layer_index(layer_name: str, num_attn_module: int = 1) -> int: """ Extract the layer index from the module name. 
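As a usage sketch of the helper introduced above (condensed from the llama_eagle hunks, assuming a fully populated vllm_config whose speculative_config carries a draft_model_config; build_drafter_fc is a hypothetical name, not a vLLM API): the drafter resolves its own quantization config once and threads it into its quantized layers instead of inheriting the verifier's config.

from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.models.utils import get_draft_quant_config, maybe_prefix


def build_drafter_fc(vllm_config, hidden_size: int, prefix: str = ""):
    # None when the draft model is unquantized; ReplicatedLinear then falls
    # back to its unquantized path.
    draft_quant_config = get_draft_quant_config(vllm_config)
    return ReplicatedLinear(
        input_size=hidden_size * 2,  # EAGLE concatenates embedding and hidden states
        output_size=hidden_size,
        bias=False,
        params_dtype=vllm_config.model_config.dtype,
        quant_config=draft_quant_config,
        prefix=maybe_prefix(prefix, "fc"),
        return_bias=False,
    )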
From a289cc1dde4a1aeee05492bbe4cc39a18f070135 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 17 Nov 2025 18:09:47 -0500 Subject: [PATCH 136/578] [Test] Batch Invariant: Rename and organize tests (#27421) Signed-off-by: yewentao256 --- tests/v1/determinism/conftest.py | 11 ++ .../test_batch_invariance.py | 75 +------- .../test_online_batch_invariance.py | 161 ++++++++++++++++++ .../test_rms_norm_batch_invariant.py | 7 +- tests/v1/determinism/utils.py | 74 ++++++++ 5 files changed, 248 insertions(+), 80 deletions(-) create mode 100644 tests/v1/determinism/conftest.py rename tests/v1/{generation => determinism}/test_batch_invariance.py (92%) create mode 100644 tests/v1/determinism/test_online_batch_invariance.py rename tests/v1/{generation => determinism}/test_rms_norm_batch_invariant.py (97%) create mode 100644 tests/v1/determinism/utils.py diff --git a/tests/v1/determinism/conftest.py b/tests/v1/determinism/conftest.py new file mode 100644 index 000000000000..3c2136e00584 --- /dev/null +++ b/tests/v1/determinism/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + + +@pytest.fixture(autouse=True) +def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): + """Automatically enable batch invariant kernel overrides for all tests.""" + monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") + yield diff --git a/tests/v1/generation/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py similarity index 92% rename from tests/v1/generation/test_batch_invariance.py rename to tests/v1/determinism/test_batch_invariance.py index 8fd038bca5d0..f018ee551dbf 100644 --- a/tests/v1/generation/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -6,66 +6,9 @@ import pytest import torch +from utils import _extract_step_logprobs, _random_prompt, skip_unsupported from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", -) - - -@pytest.fixture(autouse=True) -def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): - """Automatically enable batch invariant kernel overrides for all tests.""" - monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") - yield - - -def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: - # Generate more realistic prompts that will actually produce varied tokens - # Use a mix of common English text patterns - - prompt_templates = [ - # Question-answer style - "Question: What is the capital of France?\nAnswer: The capital of France is", - "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", - "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", - # Story/narrative style - "Once upon a time in a distant galaxy, there lived", - "The old man walked slowly down the street, remembering", - "In the year 2157, humanity finally discovered", - # Technical/code style - "To implement a binary search tree in Python, first we need to", - "The algorithm works by iterating through the array and", - "Here's how to optimize database queries using indexing:", - # Factual/informative style - "The Renaissance was a period in European history that", - "Climate change is caused by several factors including", - "The human brain contains 
approximately 86 billion neurons which", - # Conversational style - "I've been thinking about getting a new laptop because", - "Yesterday I went to the store and bought", - "My favorite thing about summer is definitely", - ] - - # Pick a random template - base_prompt = random.choice(prompt_templates) - - if max_words < min_words: - max_words = min_words - target_words = random.randint(min_words, max_words) - - if target_words > 50: - # For longer prompts, repeat context - padding_text = ( - " This is an interesting topic that deserves more explanation. " - * (target_words // 50) - ) - base_prompt = base_prompt + padding_text - - return base_prompt @skip_unsupported @@ -204,22 +147,6 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( llm_bsN.shutdown() -def _extract_step_logprobs(request_output): - if getattr(request_output, "outputs", None): - inner = request_output.outputs[0] - if hasattr(inner, "logprobs") and inner.logprobs is not None: - t = torch.tensor( - [ - inner.logprobs[i][tid].logprob - for i, tid in enumerate(inner.token_ids) - ], - dtype=torch.float32, - ) - return t, inner.token_ids - - return None, None - - @skip_unsupported @pytest.mark.parametrize( "backend", diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py new file mode 100644 index 000000000000..23f47863dd23 --- /dev/null +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +HTTP-based batch invariance test: send requests to a running +vLLM server and compare BS=1 vs BS=N results (tokens and per-step logprobs). + +Environment variables: + - VLLM_TEST_MODEL: served model name (e.g., Qwen/Qwen3-1.7B / DeepSeek-R1) + - VLLM_TP_SIZE: tensor parallelism size (e.g., 4) + +""" + +import os +import random +import sys +from typing import Any + +import openai +from utils import _random_prompt, skip_unsupported + +from tests.utils import RemoteOpenAIServer + + +def _request_completion( + client: openai.OpenAI, + model: str, + prompt: Any, + sp: dict[str, Any], + max_retries: int = 3, + retry_backoff: float = 0.5, +) -> dict[str, Any] | None: + payload: dict[str, Any] = {"model": model, "prompt": prompt} + payload.update(sp) + + for attempt in range(max_retries + 1): + try: + completion = client.completions.create(**payload) + # Convert to plain dict so downstream logic can keep using + # dict-style access just like with raw HTTP JSON. 
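# Note on the comparison performed later in _compare_bs1_vs_bsn_single_process
# (sketch, not part of the test): the assertion deliberately uses exact float
# equality, because batch invariance means BS=1 and BS=N must produce
# bit-identical logprobs; a tolerance would mask real nondeterminism.
# Ordinary floating-point arithmetic fails exact equality even for "close"
# values:
a = 0.1 + 0.2
b = 0.3
assert a != b              # 0.30000000000000004 vs 0.3
assert abs(a - b) < 1e-15  # close, but not bit-identical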
+ return completion.model_dump() + except Exception as e: # pragma: no cover + if attempt < max_retries: + import time as _t + + _t.sleep(retry_backoff * (2**attempt)) + continue + sys.stderr.write(f"Error: {e}\n") + return None + return None + + +def _extract_tokens_and_logprobs( + choice: dict[str, Any], +) -> tuple[list[Any], list[float] | None]: + tokens: list[Any] = [] + token_logprobs: list[float] | None = None + lp = choice.get("logprobs") + if lp and isinstance(lp, dict): + tokens = lp.get("token_ids") or lp.get("tokens") or [] + token_logprobs = lp.get("token_logprobs", None) + return tokens, token_logprobs + + +def _compare_bs1_vs_bsn_single_process( + prompts: list[str], + sp_kwargs: dict[str, Any], + client: openai.OpenAI, + model_name: str, +) -> None: + # BS=1 + bs1_tokens_per_prompt: list[list[Any]] = [] + bs1_logprobs_per_prompt: list[list[float] | None] = [] + for p in prompts: + resp = _request_completion(client, model_name, p, sp_kwargs) + if resp is None or not resp.get("choices"): + raise AssertionError("BS=1 empty/failed response") + choice = resp["choices"][0] + toks, lps = _extract_tokens_and_logprobs(choice) + if lps is None: + raise AssertionError( + "logprobs not returned; ensure server supports 'logprobs'" + ) + bs1_tokens_per_prompt.append(list(toks)) + bs1_logprobs_per_prompt.append(list(lps)) + + # BS=N + bsN_tokens_per_prompt: list[list[Any]] = [None] * len(prompts) # type: ignore[list-item] + bsN_logprobs_per_prompt: list[list[float] | None] = [None] * len(prompts) + resp = _request_completion(client, model_name, prompts, sp_kwargs) + if resp is None or not resp.get("choices"): + raise AssertionError("BS=N empty/failed batched response") + choices = resp.get("choices", []) + if len(choices) != len(prompts): + raise AssertionError( + f"BS=N choices length {len(choices)} != num prompts {len(prompts)}" + ) + for idx, choice in enumerate(choices): + toks, lps = _extract_tokens_and_logprobs(choice) + if lps is None: + raise AssertionError(f"BS=N missing logprobs for prompt {idx}") + bsN_tokens_per_prompt[idx] = list(toks) + bsN_logprobs_per_prompt[idx] = list(lps) + + # compare + for i, (tokens_bs1, tokens_bsN, logprobs_bs1, logprobs_bsN) in enumerate( + zip( + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + ) + ): + if tokens_bs1 != tokens_bsN: + raise AssertionError( + f"Prompt {i} (sampling): Different tokens sampled. " + f"BS=1 tokens: {tokens_bs1} BS=N tokens: {tokens_bsN}" + ) + if logprobs_bs1 is None or logprobs_bsN is None: + raise AssertionError(f"Prompt {i}: Missing logprobs in one of the runs") + if len(logprobs_bs1) != len(logprobs_bsN): + raise AssertionError( + f"Prompt {i}: Different number of steps: " + f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)." + ) + for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): + if a != b: + diff = abs(a - b) + raise AssertionError( + f"Prompt {i} Step {t}: Bitwise mismatch " + f"(abs diff={diff:.6e}). 
" + f"BS=1 tokens: {tokens_bs1} BS=N tokens: {tokens_bsN}" + ) + + +@skip_unsupported +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(): + random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) + model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + prompts_all = [_random_prompt(10, 50) for _ in range(32)] + + sp_kwargs: dict[str, Any] = { + "temperature": 0.6, + "top_p": 1.0, + "max_tokens": 8, + "seed": 42, + "logprobs": 5, + } + + tp_size = os.getenv("VLLM_TP_SIZE", "1") + server_args: list[str] = [] + if tp_size: + server_args += ["-tp", tp_size] + + with RemoteOpenAIServer(model_name, server_args) as server: + client = server.get_client() + _compare_bs1_vs_bsn_single_process( + prompts=prompts_all, + sp_kwargs=sp_kwargs, + client=client, + model_name=model_name, + ) diff --git a/tests/v1/generation/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py similarity index 97% rename from tests/v1/generation/test_rms_norm_batch_invariant.py rename to tests/v1/determinism/test_rms_norm_batch_invariant.py index f79eba58d6ef..390872519528 100644 --- a/tests/v1/generation/test_rms_norm_batch_invariant.py +++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py @@ -9,15 +9,10 @@ import pytest import torch +from utils import skip_unsupported from vllm.model_executor.layers.batch_invariant import rms_norm as triton_rms_norm from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform - -skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", -) @skip_unsupported diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py new file mode 100644 index 000000000000..5141837faea0 --- /dev/null +++ b/tests/v1/determinism/utils.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random + +import pytest +import torch + +from vllm.platforms import current_platform + +skip_unsupported = pytest.mark.skipif( + not (current_platform.is_cuda() and current_platform.has_device_capability(90)), + reason="Requires CUDA and >= Hopper (SM90)", +) + + +def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: + # Generate more realistic prompts that will actually produce varied tokens + # Use a mix of common English text patterns + + prompt_templates = [ + # Question-answer style + "Question: What is the capital of France?\nAnswer: The capital of France is", + "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", + "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", + # Story/narrative style + "Once upon a time in a distant galaxy, there lived", + "The old man walked slowly down the street, remembering", + "In the year 2157, humanity finally discovered", + # Technical/code style + "To implement a binary search tree in Python, first we need to", + "The algorithm works by iterating through the array and", + "Here's how to optimize database queries using indexing:", + # Factual/informative style + "The Renaissance was a period in European history that", + "Climate change is caused by several factors including", + "The human brain contains approximately 86 billion neurons which", + # Conversational style + "I've been thinking about getting a new laptop because", + "Yesterday I went to the store and bought", + "My favorite thing about summer is 
definitely", + ] + + # Pick a random template + base_prompt = random.choice(prompt_templates) + + if max_words < min_words: + max_words = min_words + target_words = random.randint(min_words, max_words) + + if target_words > 50: + # For longer prompts, repeat context + padding_text = ( + " This is an interesting topic that deserves more explanation. " + * (target_words // 50) + ) + base_prompt = base_prompt + padding_text + + return base_prompt + + +def _extract_step_logprobs(request_output): + if getattr(request_output, "outputs", None): + inner = request_output.outputs[0] + if hasattr(inner, "logprobs") and inner.logprobs is not None: + t = torch.tensor( + [ + inner.logprobs[i][tid].logprob + for i, tid in enumerate(inner.token_ids) + ], + dtype=torch.float32, + ) + return t, inner.token_ids + + return None, None From f77bce001a6261da0661f0192c8cddd1ca453750 Mon Sep 17 00:00:00 2001 From: Pranav <56645758+pranav4501@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:11:20 -0800 Subject: [PATCH 137/578] [Model] Add Afmoe architecture implementation (#28332) Signed-off-by: Maziyar Panahi Signed-off-by: Pranav Co-authored-by: Maziyar Panahi --- docs/models/supported_models.md | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/afmoe.py | 711 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/afmoe.py | 84 +++ 7 files changed, 804 insertions(+) create mode 100644 vllm/model_executor/models/afmoe.py create mode 100644 vllm/transformers_utils/configs/afmoe.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d47aeaab511b..bd14bbb9ab66 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -351,6 +351,7 @@ th { | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| +| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ | | `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 644d0619215f..094f921e4305 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,6 +173,10 @@ def check_available_online( _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] + "AfmoeForCausalLM": _HfExamplesInfo( + "arcee-ai/Trinity-Nano", + is_available_online=False, + ), "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py new file mode 100644 index 000000000000..6f654f47495f --- /dev/null +++ b/vllm/model_executor/models/afmoe.py @@ -0,0 +1,711 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only AfMoE model compatible with HuggingFace weights.""" + +import typing +from collections.abc import Callable, Iterable +from itertools import islice +from typing import Any + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_world_size, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP +from vllm.model_executor.models.llama import LlamaMLP as AfmoeMLP +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + PPMissingLayer, + WeightsMapper, + extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) +from vllm.sequence import IntermediateTensors + +logger = init_logger(__name__) + + +class AfmoeMoE(nn.Module): + def __init__( + self, + config, # AfmoeConfig + quant_config: QuantizationConfig | None = None, + prefix: str = "", + enable_eplb: bool = False, + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.route_scale = config.route_scale + self.score_func = config.score_func + self.route_norm = config.route_norm + + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts: int = config.num_experts + self.n_shared_experts: int = config.num_shared_experts + + if config.hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now." 
+ ) + + # Router gate + self.gate = nn.Linear( + config.hidden_size, + config.num_experts, + bias=False, + dtype=torch.float32, + ) + self.expert_bias = nn.Parameter( + torch.empty(config.num_experts, dtype=torch.float32) + ) + + # Load balancing settings + vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = enable_eplb + + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = self.ep_rank * self.n_local_physical_experts + self.physical_expert_end = ( + self.physical_expert_start + self.n_local_physical_experts + ) + + self.shared_experts = None + # Shared experts + if config.num_shared_experts > 0: + intermediate_size = config.moe_intermediate_size * config.num_shared_experts + self.shared_experts = AfmoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_experts", + ) + + # Routed experts using SharedFusedMoE + self.experts = SharedFusedMoE( + shared_experts=self.shared_experts, + num_experts=config.num_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.route_norm if self.score_func == "sigmoid" else False, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=self.score_func, + routed_scaling_factor=self.route_scale, + e_score_correction_bias=self.expert_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + router_logits = self.gate(hidden_states.to(dtype=torch.float32)) + + fused_moe_out = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if self.shared_experts is not None: + shared_output, final_hidden_states = fused_moe_out + final_hidden_states = final_hidden_states + shared_output + else: + final_hidden_states = fused_moe_out + if self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class AfmoeAttention(nn.Module): + def __init__( + self, + config, # AfmoeConfig + layer_idx: int, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: dict[str, Any] | None = None, + max_position_embeddings: int = 131072, + head_dim: int | None = None, + rms_norm_eps: float = 1e-05, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater 
than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + # Check if this is a local attention layer + self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention" + self.sliding_window = config.sliding_window if self.is_local_attention else None + + self.qkv_proj = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + # Gating projection + self.gate_proj = ColumnParallelLinear( + hidden_size, + self.total_num_heads * self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj", + ) + + # Q/K normalization + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + # Only create rotary embeddings for local attention + if self.is_local_attention: + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=True, + ) + else: + self.rotary_emb = None + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=self.sliding_window, + prefix=f"{prefix}.attn", + attn_type=attn_type, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + gate, _ = self.gate_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Apply Q/K normalization + q = self.q_norm(q.reshape(-1, self.num_heads, self.head_dim)).reshape(q.shape) + k = self.k_norm(k.reshape(-1, self.num_kv_heads, self.head_dim)).reshape( + k.shape + ) + + # Apply rotary embeddings only for local attention + if self.is_local_attention and self.rotary_emb is not None: + q, k = self.rotary_emb(positions, q, k) + + attn_output = self.attn(q, k, v) + + # Apply gating + attn_output = attn_output * torch.sigmoid(gate) + output, _ = self.o_proj(attn_output) + return output + + +class AfmoeDecoderLayer(nn.Module): + def __init__( + self, + config, # AfmoeConfig + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + enable_eplb: bool = False, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None + ): + rope_scaling["original_max_position_embeddings"] = ( + 
config.original_max_position_embeddings + ) + max_position_embeddings = getattr(config, "max_position_embeddings", 131072) + + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + self.layer_idx = extract_layer_index(prefix) + + self.self_attn = AfmoeAttention( + config=config, + layer_idx=self.layer_idx, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + head_dim=config.head_dim, + rms_norm_eps=config.rms_norm_eps, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + # MoE or dense FFN + self.moe_enabled = self.layer_idx >= config.num_dense_layers + if self.moe_enabled: + self.mlp = AfmoeMoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, + ) + else: + self.mlp = AfmoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.pre_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + hidden_states = self.post_attention_layernorm(hidden_states) # attn norm b + + # Fully Connected + hidden_states, residual = self.pre_mlp_layernorm( # ffn norm a + hidden_states, residual + ) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) # ffn norm b + + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class AfmoeModel(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + enable_eplb = vllm_config.parallel_config.enable_eplb + self.config = config + + self.vocab_size = config.vocab_size + self.mup_enabled = config.mup_enabled + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, config.hidden_size, prefix=f"{prefix}.embed_tokens" + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: AfmoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + enable_eplb=enable_eplb, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = 
make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + + # Apply muP input scaling if enabled + if self.mup_enabled: + hidden_states = hidden_states * (self.config.hidden_size**0.5) + + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in islice(self.layers, self.start_layer, self.end_layer): + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + } + ) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return SharedFusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if (weight_name not in name) or ("self_attn.gate_proj" in name): + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name_mapped, self): + continue + + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. + weight_loader = typing.cast( + Callable[..., bool], param.weight_loader + ) + success = weight_loader( + param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + + +class AfmoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_suffix={ + ".router.gate.weight": ".gate.weight", + }, + ) + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = AfmoeModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=quant_config + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + self.expert_weights = [] + + # Set MoE hyperparameters + self.num_moe_layers = config.num_hidden_layers - config.num_dense_layers + self.num_expert_groups = config.n_group + + self.moe_layers: list[SharedFusedMoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, AfmoeDecoderLayer) + if layer.moe_enabled: + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None and self.num_moe_layers > 0: + raise RuntimeError("No AfmoeMoE layer found in model.layers.") + + if example_moe is not None: + self.num_logical_experts = 
example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4af8fa01f562..6e9790de49bf 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -56,6 +56,7 @@ _TEXT_GENERATION_MODELS = { # [Decoder-only] + "AfmoeForCausalLM": ("afmoe", "AfmoeForCausalLM"), "ApertusForCausalLM": ("apertus", "ApertusForCausalLM"), "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index b7418cfb7cc7..49250e071eab 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -77,6 +77,7 @@ def __getitem__(self, key): _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( + afmoe="AfmoeConfig", chatglm="ChatGLMConfig", deepseek_vl_v2="DeepseekVLV2Config", deepseek_v32=DeepseekV3Config, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ac612b255143..dcae05a15fec 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -7,6 +7,7 @@ - There is a need to override the existing config to support vLLM. 
""" +from vllm.transformers_utils.configs.afmoe import AfmoeConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig @@ -40,6 +41,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ + "AfmoeConfig", "ChatGLMConfig", "DeepseekVLV2Config", "DotsOCRConfig", diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py new file mode 100644 index 000000000000..9b634fd037a3 --- /dev/null +++ b/vllm/transformers_utils/configs/afmoe.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers.configuration_utils import PretrainedConfig + + +class AfmoeConfig(PretrainedConfig): + model_type = "afmoe" + + def __init__( + self, + vocab_size: int = 200_192, + hidden_size: int = 2048, + intermediate_size: int = 6144, + moe_intermediate_size: int = 1408, + num_hidden_layers: int = 32, + num_dense_layers: int = 1, + num_attention_heads: int = 16, + num_key_value_heads: int | None = None, + head_dim: int = 128, + hidden_act: str = "silu", + max_position_embeddings: int = 131072, + initializer_range: float = 0.02, + rms_norm_eps: float = 1e-5, + use_cache: bool = True, + tie_word_embeddings: bool = False, + rope_theta: float = 10000.0, + rope_scaling: dict | None = None, + num_experts: int = 64, + num_experts_per_tok: int = 6, + num_shared_experts: int = 2, + num_expert_groups: int = 1, + num_limited_groups: int = 1, + score_func: str = "sigmoid", + route_norm: bool = True, + route_scale: float = 1.0, + global_attn_every_n_layers: int = 4, + sliding_window: int = 2048, + layer_types: list[str] | None = None, + attention_dropout: float = 0.0, + mup_enabled: bool = False, + n_group: int = 1, + topk_group: int = 1, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_dense_layers = num_dense_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads or num_attention_heads + self.head_dim = head_dim + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.moe_intermediate_size = moe_intermediate_size + self.num_experts = num_experts + self.num_experts_per_tok = num_experts_per_tok + self.num_shared_experts = num_shared_experts + self.num_expert_groups = num_expert_groups + self.num_limited_groups = num_limited_groups + self.score_func = score_func + self.route_norm = route_norm + self.route_scale = route_scale + + self.global_attn_every_n_layers = global_attn_every_n_layers + self.sliding_window = sliding_window + self.layer_types = layer_types + self.attention_dropout = attention_dropout + + self.mup_enabled = mup_enabled + self.n_group = n_group + self.topk_group = topk_group + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +__all__ = ["AfmoeConfig"] From 61485844fc5190721b1edf6bed9aa4d5567b70e0 Mon Sep 17 00:00:00 2001 From: Bangsheng Tang <5318912+bangshengtang@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:22:11 -0800 Subject: [PATCH 138/578] [BugFix] Corner case that could cause 
out-of-sync with external launcher mode and dp >1 (#28774) --- vllm/v1/worker/gpu_model_runner.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2a8ff746f112..0102ca4739ad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2663,6 +2663,18 @@ def execute_model( return make_empty_encoder_model_runner_output(scheduler_output) if not num_scheduled_tokens: + if ( + self.parallel_config.distributed_executor_backend + == "external_launcher" + and self.parallel_config.data_parallel_size > 1 + ): + # this is a corner case when both external launcher + # and DP are enabled, num_scheduled_tokens could be + # 0, and has_unfinished_requests in the outer loop + # returns True. before returning early here we call + # dummy run to ensure coordinate_batch_across_dp + # is called into to avoid out of sync issues. + self._dummy_run(1) if not has_kv_transfer_group(): # Return empty ModelRunnerOutput if no work to do. return EMPTY_MODEL_RUNNER_OUTPUT From 552cac95b5da283844a9994b94d4b1308a0a0565 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 17 Nov 2025 15:32:22 -0800 Subject: [PATCH 139/578] [Misc] Fix wrong comment in scheduler (#28880) Signed-off-by: Zhuohan Li --- vllm/v1/core/sched/scheduler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 61640e856ac1..4323141c435b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -472,9 +472,9 @@ def schedule(self) -> SchedulerOutput: num_computed_tokens = ( num_new_local_computed_tokens + num_external_computed_tokens ) - # KVTransfer: WAITING reqs have num_computed_tokens > 0 - # after async KV recvs are completed. else: + # KVTransfer: WAITING reqs have num_computed_tokens > 0 + # after async KV recvs are completed. new_computed_blocks = self.kv_cache_manager.empty_kv_cache_blocks num_new_local_computed_tokens = 0 num_computed_tokens = request.num_computed_tokens @@ -483,12 +483,12 @@ def schedule(self) -> SchedulerOutput: external_load_encoder_input = [] new_encoder_compute_budget = encoder_compute_budget - # KVTransfer: loading remote KV, do not allocate for new work. if load_kv_async: + # KVTransfer: loading remote KV, do not allocate for new work. assert num_external_computed_tokens > 0 num_new_tokens = 0 - # Number of tokens to be scheduled. else: + # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed # requests, which have output tokens. 
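The external-launcher fix above boils down to a collective-participation rule: under data parallelism, every rank must enter the cross-DP batch-coordination step on every iteration, even when it has zero scheduled tokens, otherwise its peers block waiting for it. The sketch below is not vLLM code; it is a minimal stand-alone illustration in which a threading.Barrier stands in for coordinate_batch_across_dp and two threads stand in for two DP ranks, purely to show why the dummy run is issued before returning early.

import threading

NUM_DP_RANKS = 2
# Stand-in for the cross-DP coordination collective; all "ranks" must arrive.
coordination = threading.Barrier(NUM_DP_RANKS, timeout=2.0)


def rank_step(rank: int, num_scheduled_tokens: int, always_coordinate: bool) -> str:
    try:
        if num_scheduled_tokens == 0 and not always_coordinate:
            # Buggy behaviour: return early and skip the collective entirely.
            return f"rank {rank}: returned early"
        # Fixed behaviour: even with nothing scheduled, run a dummy step so
        # the collective still fires and peers are not left waiting.
        coordination.wait()
        return f"rank {rank}: coordinated"
    except threading.BrokenBarrierError:
        return f"rank {rank}: timed out waiting for peers"


for always_coordinate in (False, True):
    coordination.reset()
    results: list[str] = []
    threads = [
        threading.Thread(
            target=lambda r=rank: results.append(
                rank_step(r, 0 if r == 0 else 8, always_coordinate)
            )
        )
        for rank in range(NUM_DP_RANKS)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(f"always_coordinate={always_coordinate}: {sorted(results)}")

With always_coordinate=False, the rank that has work times out at the barrier; with always_coordinate=True, both ranks report "coordinated", mirroring the effect of calling _dummy_run(1) before the early return in the patch above.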
From b6e04390d3ea5ebc79ac70d1b76d638c56fa8ce2 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Tue, 18 Nov 2025 03:13:25 +0000 Subject: [PATCH 140/578] [Bugfix] Fix Kimi-K2 tool parser concatenated tool calls parsing (#28831) Signed-off-by: Thomas Mao Signed-off-by: bbartels Co-authored-by: Thomas Mao Co-authored-by: Chauncey --- tests/tool_use/test_kimi_k2_tool_parser.py | 122 ++++++++++++++++++ .../tool_parsers/kimi_k2_tool_parser.py | 3 +- 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py index 33dabbc7e7b9..3a48b5206141 100644 --- a/tests/tool_use/test_kimi_k2_tool_parser.py +++ b/tests/tool_use/test_kimi_k2_tool_parser.py @@ -60,6 +60,11 @@ def test_extract_tool_calls_no_tools(kimi_k2_tool_parser): ids=[ "tool_call_with_content_before", "multi_tool_call_with_content_before", + "concatenated_tool_calls_bug_fix", + "three_concatenated_tool_calls", + "mixed_spacing_tool_calls", + "angle_brackets_in_json", + "newlines_in_json", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -114,6 +119,123 @@ def test_extract_tool_calls_no_tools(kimi_k2_tool_parser): ], "I'll help you check the weather. ", ), + ( + """I'll get the weather and news for LA today. First, let me get the weather using Los Angeles coordinates, and then get the latest news. <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"latitude": 34.0522, "longitude": -118.2437}<|tool_call_end|><|tool_call_begin|>functions.get_news:1<|tool_call_argument_begin|>{"content": "Los Angeles today"}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.get_weather:0", + function=FunctionCall( + name="get_weather", + arguments=json.dumps( + {"latitude": 34.0522, "longitude": -118.2437} + ), + ), + type="function", + ), + ToolCall( + id="functions.get_news:1", + function=FunctionCall( + name="get_news", + arguments=json.dumps({"content": "Los Angeles today"}), + ), + type="function", + ), + ], + "I'll get the weather and news for LA today. First, let me get the weather using Los Angeles coordinates, and then get the latest news. ", + ), + ( + """I'll help you with multiple tasks. <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city": "New York"}<|tool_call_end|><|tool_call_begin|>functions.get_news:1<|tool_call_argument_begin|>{"topic": "technology"}<|tool_call_end|><|tool_call_begin|>functions.send_email:2<|tool_call_argument_begin|>{"to": "user@example.com", "subject": "Daily Update"}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.get_weather:0", + function=FunctionCall( + name="get_weather", + arguments=json.dumps({"city": "New York"}), + ), + type="function", + ), + ToolCall( + id="functions.get_news:1", + function=FunctionCall( + name="get_news", + arguments=json.dumps({"topic": "technology"}), + ), + type="function", + ), + ToolCall( + id="functions.send_email:2", + function=FunctionCall( + name="send_email", + arguments=json.dumps( + {"to": "user@example.com", "subject": "Daily Update"} + ), + ), + type="function", + ), + ], + "I'll help you with multiple tasks. ", + ), + ( + """Mixed spacing test. 
<|tool_calls_section_begin|> <|tool_call_begin|> functions.test:0 <|tool_call_argument_begin|> {} <|tool_call_end|><|tool_call_begin|>functions.test2:1<|tool_call_argument_begin|>{}<|tool_call_end|> <|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.test:0", + function=FunctionCall( + name="test", + arguments=json.dumps({}), + ), + type="function", + ), + ToolCall( + id="functions.test2:1", + function=FunctionCall( + name="test2", + arguments=json.dumps({}), + ), + type="function", + ), + ], + "Mixed spacing test. ", + ), + ( + """I need to process HTML content. <|tool_calls_section_begin|><|tool_call_begin|>functions.process_html:0<|tool_call_argument_begin|>{"html": "
<div>content</div>
", "text": "normal text"}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.process_html:0", + function=FunctionCall( + name="process_html", + arguments=json.dumps( + {"html": "
<div>content</div>
", "text": "normal text"} + ), + ), + type="function", + ) + ], + "I need to process HTML content. ", + ), + ( + """I need to process formatted JSON. <|tool_calls_section_begin|><|tool_call_begin|>functions.process_data:0<|tool_call_argument_begin|>{ + "name": "test", + "value": 123, + "nested": { + "key": "value" + } +}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.process_data:0", + function=FunctionCall( + name="process_data", + arguments=json.dumps( + {"name": "test", "value": 123, "nested": {"key": "value"}}, + indent=2, + ), + ), + type="function", + ) + ], + "I need to process formatted JSON. ", + ), ], ) def test_extract_tool_calls( diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index a84c9e454716..2b84c60a3b84 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -60,7 +60,8 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_call_end_token: str = "<|tool_call_end|>" self.tool_call_regex = re.compile( - r"<\|tool_call_begin\|>\s*(?P.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*?)\s*<\|tool_call_end\|>" + r"<\|tool_call_begin\|>\s*(?P[^<]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P(?:(?!<\|tool_call_begin\|>).)*?)\s*<\|tool_call_end\|>", + re.DOTALL, ) self.stream_tool_call_portion_regex = re.compile( From 88ab591f0b20c28cb167fd65d10ccade99d873ae Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 17 Nov 2025 22:16:03 -0500 Subject: [PATCH 141/578] Run macos smoke test workflow on main commit (#28752) Signed-off-by: Michael Goin Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 15 ++++++++++----- requirements/cpu-build.txt | 5 +++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 8d40aa587bf0..42b05ecd5ac0 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -1,6 +1,9 @@ name: macOS Apple Silicon Smoke Test on: + push: + branches: + - main workflow_dispatch: # Manual trigger jobs: @@ -19,13 +22,15 @@ jobs: pyproject.toml python-version: '3.12' - - name: Install dependencies + - name: Create virtual environment run: | - uv pip install -r requirements/cpu-build.txt - uv pip install -r requirements/cpu.txt + uv venv + echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH" - - name: Build vLLM - run: uv pip install -v -e . + - name: Install dependencies and build vLLM + run: | + uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match + uv pip install -e . 
env: CMAKE_BUILD_PARALLEL_LEVEL: 4 diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 331d02be6621..81d429a5e5f8 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,8 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.8.0+cpu; platform_machine == "x86_64" -torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.9.0; platform_system == "Darwin" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 From d0a73620cc85a840323d25b28772efac04c006e2 Mon Sep 17 00:00:00 2001 From: xuebwang-amd Date: Tue, 18 Nov 2025 11:16:45 +0800 Subject: [PATCH 142/578] [ROCm][Quantization] add apply_vllm_mapper in quark config for models like gpt-oss (#28638) Signed-off-by: xuebwang-amd Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../layers/quantization/quark/quark.py | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 1bb698faf46d..f59e5e2a0af7 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -32,6 +32,7 @@ deep_compare, should_ignore_layer, ) +from vllm.model_executor.models.utils import WeightsMapper from vllm.platforms import current_platform if TYPE_CHECKING: @@ -57,7 +58,6 @@ def __init__( self.kv_cache_group = kv_cache_group self.kv_cache_config = kv_cache_config self.pack_method = pack_method - self.ignore: list[str] = cast(list[str], self.quant_config.get("exclude", [])) def get_linear_method(self) -> "QuarkLinearMethod": return QuarkLinearMethod(self) @@ -72,14 +72,42 @@ def get_min_capability(cls) -> int: def get_name(self) -> QuantizationMethods: return "quark" + def apply_vllm_mapper( # noqa: B027 + self, hf_to_vllm_mapper: "WeightsMapper" + ): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + quant_config_with_hf_to_vllm_mapper = {} + + for k, v in self.quant_config.items(): + if isinstance(v, list): + quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_list(v) + elif isinstance(v, dict): + quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_dict(v) + else: + if isinstance(v, str): + mapped_v_list = hf_to_vllm_mapper.apply_list([v]) + if mapped_v_list: + quant_config_with_hf_to_vllm_mapper[k] = mapped_v_list[0] + else: + quant_config_with_hf_to_vllm_mapper[k] = v + + self.quant_config = quant_config_with_hf_to_vllm_mapper + def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. 
+ exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer( - prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping + prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping ): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): @@ -93,9 +121,6 @@ def get_quant_method( return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) return None - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) - @classmethod def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": export_config = config.get("export") From 3ddcf4601171797b6e63eda6b5956136441b3408 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 17 Nov 2025 23:29:29 -0500 Subject: [PATCH 143/578] [Refactor] Remove Unused Func in Batch Invariant (#28881) Signed-off-by: yewentao256 --- vllm/model_executor/layers/batch_invariant.py | 73 ------------------- 1 file changed, 73 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 746a543ab827..7920d117de5e 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import contextlib import os -from collections import namedtuple from collections.abc import Callable from functools import cache from typing import Any @@ -725,10 +723,6 @@ def linear_batch_invariant(input, weight, bias=None): _original_cublaslt_workspace_size = None -def is_batch_invariant_mode_enabled(): - return _batch_invariant_MODE - - def enable_batch_invariant_mode(): global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm global _original_fp16_reduction_precision, _original_bf16_reduction_precision @@ -791,73 +785,6 @@ def enable_batch_invariant_mode(): torch.backends.cuda.preferred_blas_library(backend="cublaslt") -def disable_batch_invariant_mode(): - global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm - global _original_fp16_reduction_precision, _original_bf16_reduction_precision - global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size - if not _batch_invariant_MODE: - return - - if _batch_invariant_LIB is not None: - _batch_invariant_LIB._destroy() - if _original_torch_bmm is not None: - torch.bmm = _original_torch_bmm - _original_torch_bmm = None - - if _original_bf16_reduction_precision is not None: - torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = ( - _original_bf16_reduction_precision - ) - _original_bf16_reduction_precision = None - if _original_fp16_reduction_precision is not None: - torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( - _original_fp16_reduction_precision - ) - _original_fp16_reduction_precision = None - - torch.backends.cuda.preferred_blas_library(backend="default") - - if not is_torch_equal_or_newer("2.10.0.dev"): - # Set cublas env vars to previous results. If previous results are None, - # that means the env vars were not set, so we should remove them. 
- if _original_cublas_workspace_cfg: - os.environ["CUBLAS_WORKSPACE_CONFIG"] = _original_cublas_workspace_cfg - elif "CUBLAS_WORKSPACE_CONFIG" in os.environ: - del os.environ["CUBLAS_WORKSPACE_CONFIG"] - - if _original_cublaslt_workspace_size: - os.environ["CUBLASLT_WORKSPACE_SIZE"] = _original_cublaslt_workspace_size - elif "CUBLASLT_WORKSPACE_SIZE" in os.environ: - del os.environ["CUBLASLT_WORKSPACE_SIZE"] - - _original_cublas_workspace_cfg = None - _original_cublaslt_workspace_size = None - - _batch_invariant_MODE = False - _batch_invariant_LIB = None - - -@contextlib.contextmanager -def set_batch_invariant_mode(enabled: bool = True): - global _batch_invariant_MODE, _batch_invariant_LIB - old_data = (_batch_invariant_MODE, _batch_invariant_LIB) - if enabled: - enable_batch_invariant_mode() - else: - disable_batch_invariant_mode() - yield - if _batch_invariant_LIB is not None: - _batch_invariant_LIB._destroy() - _batch_invariant_MODE, _batch_invariant_LIB = old_data - - -AttentionBlockSize = namedtuple("AttentionBlockSize", ["block_m", "block_n"]) - - -def get_batch_invariant_attention_block_size() -> AttentionBlockSize: - return AttentionBlockSize(block_m=16, block_n=16) - - @cache def vllm_is_batch_invariant(): env_key = "VLLM_BATCH_INVARIANT" From bf9e1e8767fb4d1143b7e042ed940b84ef031c66 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 18 Nov 2025 12:30:29 +0800 Subject: [PATCH 144/578] [Bugfix] Fix wrong CLI defaults for dynamic `SchedulerConfig` fields (#28872) Signed-off-by: DarkLight1337 --- .../openai/test_enable_force_include_usage.py | 4 ++-- vllm/engine/arg_utils.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py index 3ddf2308eb1d..9d527c45c1fa 100644 --- a/tests/entrypoints/openai/test_enable_force_include_usage.py +++ b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -17,7 +17,7 @@ def chat_server_with_force_include_usage(request): # noqa: F811 "128", "--enforce-eager", "--max-num-seqs", - "1", + "4", "--enable-force-include-usage", "--port", "55857", @@ -78,7 +78,7 @@ def transcription_server_with_force_include_usage(): "--dtype", "bfloat16", "--max-num-seqs", - "1", + "4", "--enforce-eager", "--enable-force-include-usage", "--gpu-memory-utilization", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d011dfdbfbb2..ab6e5e594c23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1046,10 +1046,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: description=SchedulerConfig.__doc__, ) scheduler_group.add_argument( - "--max-num-batched-tokens", **scheduler_kwargs["max_num_batched_tokens"] + "--max-num-batched-tokens", + **{ + **scheduler_kwargs["max_num_batched_tokens"], + "default": None, + }, ) scheduler_group.add_argument( - "--max-num-seqs", **scheduler_kwargs["max_num_seqs"] + "--max-num-seqs", + **{ + **scheduler_kwargs["max_num_seqs"], + "default": None, + }, ) scheduler_group.add_argument( "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"] @@ -1071,7 +1079,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--scheduling-policy", **scheduler_kwargs["policy"] ) scheduler_group.add_argument( - "--enable-chunked-prefill", **scheduler_kwargs["enable_chunked_prefill"] + "--enable-chunked-prefill", + **{ + **scheduler_kwargs["enable_chunked_prefill"], + "default": None, + 
}, ) scheduler_group.add_argument( "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"] From 083cf326dc9ce92aae6b85fcef678a28e867afe9 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 18 Nov 2025 05:32:14 +0100 Subject: [PATCH 145/578] [Doc]: fix typos in various files (#28863) Signed-off-by: Didier Durand --- docs/contributing/profiling.md | 2 +- docs/design/io_processor_plugins.md | 2 +- docs/design/logits_processors.md | 4 ++-- docs/features/disagg_prefill.md | 2 +- docs/features/lora.md | 2 +- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 7941b1f49ee8..7634cc0859ed 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -224,6 +224,6 @@ snakeviz expensive_function.prof Leverage VLLM_GC_DEBUG environment variable to debug GC costs. -- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times +- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times - VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5 collected objects for each gc.collect diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index 2f4b17f191a5..91ab4deae71d 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -1,6 +1,6 @@ # IO Processor Plugins -IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output. +IO Processor plugins are a feature that allows pre- and post-processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output. When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint. diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index acf7fc245462..8eadeb386fcf 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -411,7 +411,7 @@ Logits processor `update_state()` implementations should assume the following mo * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. 
Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous - * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots + * **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots 5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch @@ -548,7 +548,7 @@ Built-in logits processors are always loaded when the vLLM engine starts. See th Review these logits processor implementations for guidance on writing built-in logits processors. -Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforemented logits processor programming model. +Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model. * Allowed token IDs diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index 3e8cb87e37d3..fd4f249f2ec6 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -91,6 +91,6 @@ Disaggregated prefilling is highly related to infrastructure, so vLLM relies on We recommend three ways of implementations: -- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. +- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc.). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. - **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL. - **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`. diff --git a/docs/features/lora.md b/docs/features/lora.md index 3a85b52d89b6..d42a3cef76bd 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -4,7 +4,7 @@ This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09 LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA]. -Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +Adapters can be efficiently served on a per-request basis with minimal overhead. 
First we download the adapter(s) and save them locally with ```python diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 893972144e99..e2dd47dbb4e6 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -154,7 +154,7 @@ def _fused_moe_lora_kernel( k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K) # pre-fetch lora weight b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0) - # GDC wait waits for ALL programs in the the prior kernel to complete + # GDC wait waits for ALL programs in the prior kernel to complete # before continuing. if USE_GDC and not IS_PRIMARY: tl.extra.cuda.gdc_wait() From 0168f69e50898fd5f09ac64a0d735039e57e7806 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 18 Nov 2025 12:33:46 +0800 Subject: [PATCH 146/578] [Misc] Remove unnecessary parentheses from log statements (#28897) Signed-off-by: Andy Xie --- vllm/model_executor/models/registry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 6e9790de49bf..a2de597c87d8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -597,7 +597,7 @@ def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None: mi_dict = json.load(file) except FileNotFoundError: logger.debug( - ("Cached model info file for class %s.%s not found"), + "Cached model info file for class %s.%s not found", self.module_name, self.class_name, ) @@ -605,7 +605,7 @@ def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None: if mi_dict["hash"] != module_hash: logger.debug( - ("Cached model info file for class %s.%s is stale"), + "Cached model info file for class %s.%s is stale", self.module_name, self.class_name, ) @@ -615,7 +615,7 @@ def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None: return _ModelInfo(**mi_dict["modelinfo"]) except Exception: logger.debug( - ("Cached model info for class %s.%s error. "), + "Cached model info for class %s.%s error. ", self.module_name, self.class_name, ) @@ -650,14 +650,14 @@ def inspect_model_cls(self) -> _ModelInfo: mi = self._load_modelinfo_from_cache(module_hash) if mi is not None: logger.debug( - ("Loaded model info for class %s.%s from cache"), + "Loaded model info for class %s.%s from cache", self.module_name, self.class_name, ) return mi else: logger.debug( - ("Cache model info for class %s.%s miss. Loading model instead."), + "Cache model info for class %s.%s miss. Loading model instead.", self.module_name, self.class_name, ) From 5bdd15527770ef39cc4c3cdca008fb4f9cf8a15f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 17 Nov 2025 21:26:32 -0800 Subject: [PATCH 147/578] [CI] Fix async scheduling + spec decoding test flake (#28902) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index f732b05f09f9..00d93e1ba0b5 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -84,6 +84,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): "num_speculative_tokens": 2, "model": "nm-testing/Llama3_2_1B_speculator.eagle3", } + # Set small draft model len to force doesn't-fit-in-drafter case. 
spec_config_short = spec_config | {"max_model_len": 50} # test_preemption, executor, async_scheduling, @@ -174,13 +175,14 @@ def run_tests( ): if "spec_mml=None" in test_config: assert ( - pytest.approx(test_acceptance_rate, rel=5e-2) - == base_acceptance_rate + test_acceptance_rate > base_acceptance_rate + or test_acceptance_rate + == pytest.approx(base_acceptance_rate, rel=5e-2) ) else: # Currently the reported acceptance rate is expected to be # lower when we sometimes skip drafting altogether. - assert test_acceptance_rate > 0.05 + assert test_acceptance_rate > 0.1 print( f"PASSED: config=[{test_config}], params={params}" f" accept_rate={test_acceptance_rate}" From 5bb1da5190b54aefb08478c6b1170f97722b8bdb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Nov 2025 13:28:31 +0800 Subject: [PATCH 148/578] [MISC] Remove format.sh (#28906) Signed-off-by: Kuntai Du --- format.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 format.sh diff --git a/format.sh b/format.sh deleted file mode 100755 index 6ba93e0a19ba..000000000000 --- a/format.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -echo "vLLM linting system has been moved from format.sh to pre-commit hooks." -echo "Please run 'pip install -r requirements/lint.txt', followed by" -echo "'pre-commit install' to install the pre-commit hooks." -echo "Then linters will run automatically before each commit." \ No newline at end of file From 896e41ae04d18b0f984eefbb41b920aa7505f5d1 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 18 Nov 2025 16:10:55 +0800 Subject: [PATCH 149/578] [CI/Build] Replace wikipedia url with local server ones (#28908) Signed-off-by: Isotr0py --- tests/entrypoints/openai/test_metrics.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index dbcec9d31fc9..4e7b765d7713 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,7 @@ from vllm import version +from ...conftest import LocalAssetServer from ...utils import RemoteOpenAIServer MODELS = { @@ -69,7 +70,6 @@ async def client(server): _PROMPT = "Hello my name is Robert and I love magic" -_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int): @@ -250,6 +250,7 @@ async def test_metrics_counts( @pytest.mark.asyncio async def test_metrics_exist( + local_asset_server: LocalAssetServer, server: RemoteOpenAIServer, client: openai.AsyncClient, model_key: str, @@ -265,13 +266,21 @@ async def test_metrics_exist( temperature=0.0, ) else: + # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg await client.chat.completions.create( model=model_name, messages=[ { "role": "user", "content": [ - {"type": "image_url", "image_url": {"url": _IMAGE_URL}}, + { + "type": "image_url", + "image_url": { + "url": local_asset_server.url_for( + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + ), + }, + }, {"type": "text", "text": "What's in this image?"}, ], } From 439368496db48d8f992ba8c606a0c0b1eebbfa69 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 18 Nov 2025 00:20:45 -0800 Subject: [PATCH 150/578] [BugFix] Fix PP/async scheduling with pooling models (#28899) Signed-off-by: Nick 
Hill Co-authored-by: Cyrus Leung --- vllm/v1/engine/core.py | 3 ++- vllm/v1/executor/ray_executor.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 97286c6e2e5e..d49eb752d56a 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -184,6 +184,7 @@ def __init__( vllm_config.ec_transfer_config is not None and vllm_config.ec_transfer_config.is_ec_producer ) + self.is_pooling_model = vllm_config.model_config.runner_type == "pooling" self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None: @@ -392,7 +393,7 @@ def step_with_batch_queue( if not self.ec_producer: model_executed = scheduler_output.total_num_scheduled_tokens > 0 - if not model_executed: + if self.is_pooling_model or not model_executed: # No sampling required (no requests scheduled). future = cast(Future[ModelRunnerOutput], exec_future) else: diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 55db7445c9c7..406eafcd339b 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -99,9 +99,9 @@ def _init_executor(self) -> None: # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None - self.ec_producer = ( - self.vllm_config.ec_transfer_config is not None - and self.vllm_config.ec_transfer_config.is_ec_producer + self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and ( + self.vllm_config.ec_transfer_config is None + or not self.vllm_config.ec_transfer_config.is_ec_producer ) self.scheduler_output: SchedulerOutput | None = None @@ -401,7 +401,7 @@ def execute_model( # type: ignore[override] "after execute_model() returns None." ) - if self.ec_producer or not scheduler_output.total_num_scheduled_tokens: + if not self.uses_sampler or not scheduler_output.total_num_scheduled_tokens: # Model will not execute, call model runner immediately. 
return self._execute_dag(scheduler_output, None, non_block) From 285eaa42857ba2a8f377fdd0dcd84120260d8f65 Mon Sep 17 00:00:00 2001 From: Song Zhixin Date: Tue, 18 Nov 2025 18:53:44 +0800 Subject: [PATCH 151/578] [Bugfix] Safeguard against missing backend in AttentionBackendEnum (#28846) Signed-off-by: jesse Signed-off-by: Song Zhixin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/attention/layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 37f9a4b383ce..a8e796a1eab6 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -310,7 +310,8 @@ def __init__( kv_sharing_target_layer_name, **extra_impl_args, ) - self.backend = AttentionBackendEnum[self.attn_backend.get_name()] + backend_name = self.attn_backend.get_name() + self.backend = AttentionBackendEnum.__members__.get(backend_name) self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how From b9489f51e1c61c96378e12c9523f9de7043ca294 Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Tue, 18 Nov 2025 19:51:54 +0800 Subject: [PATCH 152/578] [Model][Perf] Use cos and sin cache in QwenVL (#28798) Signed-off-by: gcanlin --- .../layers/rotary_embedding/base.py | 5 + vllm/model_executor/models/glm4_1v.py | 88 +++++------- vllm/model_executor/models/qwen2_5_vl.py | 123 ++++++++-------- vllm/model_executor/models/qwen2_vl.py | 135 ++++++------------ .../models/qwen3_omni_moe_thinker.py | 40 ++++-- vllm/model_executor/models/qwen3_vl.py | 44 ++++-- 6 files changed, 218 insertions(+), 217 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index ce4f40680b0a..4114b21168cc 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -83,6 +83,11 @@ def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None: ): self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) + def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]: + cos_sin = self.cos_sin_cache[:seqlen] + cos, sin = cos_sin.chunk(2, dim=-1) + return cos, sin + class RotaryEmbedding(RotaryEmbeddingBase): def __init__( diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 6953b805653b..65c3fc2d9e97 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -65,6 +65,7 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -341,7 +342,8 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: @@ -353,10 +355,12 @@ def forward( batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() for x in (q, k, v)) - if rotary_pos_emb is not None: + if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None: # [2 * b, s, heads, head_dim] qk_concat = torch.cat([q, k], dim=0) - qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + qk_rotated = apply_rotary_pos_emb_vision( + qk_concat, rotary_pos_emb_cos, rotary_pos_emb_sin + ) q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: @@ -454,14 +458,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -660,44 +666,6 @@ def forward( return embeddings -class Glm4vVisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, - self.dim, - 2, - dtype=torch.float, - device=self.inv_freq.device, - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - class Glm4vVisionTransformer(nn.Module): def __init__( self, @@ -731,7 +699,13 @@ def __init__( norm_layer = partial(RMSNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) self.blocks = nn.ModuleList( [ Glm4vVisionBlock( @@ -789,7 +763,9 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + def rot_pos_emb( + self, grid_thw: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: pos_ids = [] for t, h, w in grid_thw: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) @@ -817,9 +793,18 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb, pos_ids + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + 
sin_combined = torch.cat([sin_h, sin_w], dim=-1) + return cos_combined, sin_combined, pos_ids def compute_attn_mask_seqlen( self, @@ -848,7 +833,9 @@ def forward( x = self.post_conv_layernorm(x) # compute position embedding - rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw) + rotary_pos_emb_cos, rotary_pos_emb_sin, image_type_ids = self.rot_pos_emb( + grid_thw + ) # compute cu_seqlens cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] @@ -867,7 +854,8 @@ def forward( x = blk( x, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 897dd7ef29f1..2e4fd9645d88 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -64,6 +64,7 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.vision import should_torch_compile_mm_vit @@ -363,7 +364,8 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: @@ -378,13 +380,15 @@ def forward( head=self.num_attention_heads_per_partition, ) - if rotary_pos_emb is not None: + if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None: qk, v = qkv[:, :, :2], qkv[:, :, 2] qk_reshaped = einops.rearrange( qk, "b s two head head_dim -> (two b) s head head_dim", two=2 ) - qk_rotated = apply_rotary_pos_emb_vision(qk_reshaped, rotary_pos_emb) + qk_rotated = apply_rotary_pos_emb_vision( + qk_reshaped, cos=rotary_pos_emb_cos, sin=rotary_pos_emb_sin + ) qk_rotated = qk_rotated.view( 2, batch_size, @@ -434,7 +438,8 @@ def forward( dynamic_arg_dims={ "x": 0, "cu_seqlens": 0, - "rotary_pos_emb": 0, + "rotary_pos_emb_cos": 0, + "rotary_pos_emb_sin": 0, "seqlens": 0, }, mark_unbacked_dims={"seqlens": 0}, @@ -485,14 +490,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -588,42 +595,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out -class Qwen2_5_VisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / ( - theta ** (torch.arange(0, dim, 2, dtype=torch.float, device="cpu") / dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - 
self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - class Qwen2_5_VisionTransformer(nn.Module): def __init__( self, @@ -666,7 +637,13 @@ def __init__( norm_layer = partial(RMSNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) use_upstream_fa = False self.attn_backend = get_vit_attn_backend( @@ -757,15 +734,30 @@ def rotary_pos_emb_thw(self, t, h, w): ) pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) max_size = max(h, w) - rotary_pos_emb_full = self.rotary_pos_emb(max_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - rotary_pos_emb = rotary_pos_emb.reshape( - rotary_pos_emb.shape[0] // self.spatial_merge_unit, + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + cos_combined = cos_combined.reshape( + cos_combined.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, + -1, + ) + sin_combined = sin_combined.reshape( + sin_combined.shape[0] // self.spatial_merge_unit, self.spatial_merge_unit, -1, ) - return rotary_pos_emb + return cos_combined, sin_combined def get_window_index_thw(self, grid_t, grid_h, grid_w): vit_merger_window_size = ( @@ -807,14 +799,19 @@ def get_window_index_thw(self, grid_t, grid_h, grid_w): @lru_cache(maxsize=1024) # noqa: B019 def get_rope_by_thw(self, t, h, w): window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw(t, h, w) - rotary_pos_emb_thw = self.rotary_pos_emb_thw(t, h, w) - rotary_pos_emb_thw = rotary_pos_emb_thw[window_index_thw, :, :] - rotary_pos_emb_thw = rotary_pos_emb_thw.flatten(start_dim=0, end_dim=1) + cos_thw, sin_thw = self.rotary_pos_emb_thw(t, h, w) + + cos_thw = cos_thw[window_index_thw, :, :] + cos_thw = cos_thw.flatten(start_dim=0, end_dim=1) + sin_thw = sin_thw[window_index_thw, :, :] + sin_thw = sin_thw.flatten(start_dim=0, end_dim=1) + cu_seqlens_thw = torch.repeat_interleave( torch.tensor([h * w], dtype=torch.int32), t ) return ( - rotary_pos_emb_thw, + cos_thw, + sin_thw, window_index_thw, cu_seqlens_window_thw, cu_seqlens_thw, @@ -849,7 +846,8 @@ def forward( ) -> torch.Tensor: # patchify seq_len, _ = x.size() - rotary_pos_emb = [] + rotary_pos_emb_cos = [] + rotary_pos_emb_sin = [] window_index: list = [] cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] cu_seqlens: list = [] @@ -865,7 +863,8 @@ def forward( llm_w = w // self.spatial_merge_size ( - rotary_pos_emb_thw, + cos_thw, + sin_thw, window_index_thw, cu_seqlens_window_thw, cu_seqlens_thw, @@ -878,11 +877,13 @@ def forward( cu_window_seqlens_last = cu_seqlens_window_thw[-1] cu_window_seqlens.append(cu_seqlens_window_thw) - rotary_pos_emb.append(rotary_pos_emb_thw) + 
rotary_pos_emb_cos.append(cos_thw) + rotary_pos_emb_sin.append(sin_thw) cu_seqlens.append(cu_seqlens_thw) - rotary_pos_emb = torch.cat(rotary_pos_emb) + rotary_pos_emb_cos = torch.cat(rotary_pos_emb_cos) + rotary_pos_emb_sin = torch.cat(rotary_pos_emb_sin) window_index = torch.cat(window_index) # compute reverse indices reverse_indices = self.invert_permutation(window_index) @@ -901,7 +902,12 @@ def forward( cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True) - rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) + rotary_pos_emb_cos = rotary_pos_emb_cos.to( + device=self.device, non_blocking=True + ) + rotary_pos_emb_sin = rotary_pos_emb_sin.to( + device=self.device, non_blocking=True + ) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True @@ -928,7 +934,8 @@ def forward( hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens_now, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen_now, seqlens=seqlens_now, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5d21e249fc4c..53df5972a8fe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -32,7 +32,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange, repeat +from einops import rearrange from transformers import BatchFeature from transformers.models.qwen2_vl import Qwen2VLImageProcessor, Qwen2VLProcessor from transformers.models.qwen2_vl.configuration_qwen2_vl import ( @@ -59,7 +59,9 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding.common import ( + apply_rotary_emb_torch, dispatch_rotary_emb_function, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -275,47 +277,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: - if not interleaved: - x1, x2 = x.chunk(2, dim=-1) - return torch.cat((-x2, x1), dim=-1) - else: - x1, x2 = x[..., ::2], x[..., 1::2] - return rearrange( - torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2 - ) - - -def apply_rotary_emb_torch( - x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False +def apply_rotary_pos_emb_vision( + t: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor ) -> torch.Tensor: - """ - x: (batch_size, seqlen, nheads, headdim) - cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) - """ - ro_dim = cos.shape[-1] * 2 - assert ro_dim <= x.shape[-1] - cos = repeat( - cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)" - ) - sin = repeat( - sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 
1 (d 2)" + rotary_emb_function = dispatch_rotary_emb_function( + default=partial(apply_rotary_emb_torch, is_neox_style=True) ) - return torch.cat( - [ - x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, - x[..., ro_dim:], - ], - dim=-1, - ) - - -def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: - rotary_emb_function = dispatch_rotary_emb_function(default=apply_rotary_emb_torch) - t_ = t.float() - cos = freqs.cos() - sin = freqs.sin() - output = rotary_emb_function(t_, cos, sin).type_as(t) + output = rotary_emb_function(t, cos, sin).type_as(t) return output @@ -412,7 +380,8 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: @@ -424,11 +393,13 @@ def forward( batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... -> b s ...") for x in (q, k, v)) - if rotary_pos_emb is not None: - # [2 * b, s, heads, head_dim] - qk_concat = torch.cat([q, k], dim=0) - qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) - q, k = torch.chunk(qk_rotated, 2, dim=0) + + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision( + qk_concat, rotary_pos_emb_cos, rotary_pos_emb_sin + ) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) @@ -534,14 +505,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -628,40 +601,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out -class Qwen2VisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - class Qwen2VisionTransformer(nn.Module): def __init__( self, @@ -700,7 +639,13 @@ def __init__( norm_layer = partial(nn.LayerNorm, eps=norm_eps) head_dim = embed_dim // num_heads - self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + 
is_neox_style=True, + ) self.blocks = nn.ModuleList( [ @@ -744,7 +689,9 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: + def rot_pos_emb( + self, grid_thw: list[list[int]] + ) -> tuple[torch.Tensor, torch.Tensor]: pos_ids = [] max_grid_size = 0 for t, h, w in grid_thw: @@ -773,9 +720,18 @@ def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) max_grid_size = max(max_grid_size, h, w) pos_ids = torch.cat(pos_ids, dim=0) - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + return cos_combined, sin_combined def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor @@ -806,7 +762,7 @@ def forward( grid_thw_list = grid_thw.tolist() # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw_list) + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) # compute cu_seqlens cu_seqlens = torch.repeat_interleave( @@ -824,7 +780,8 @@ def forward( x = blk( x, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 40b80ce2387c..8274b92138f7 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -60,6 +60,7 @@ ) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo @@ -90,7 +91,6 @@ ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, - Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VLProcessingInfo, ) from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel @@ -221,14 +221,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -332,7 +334,13 @@ def __init__( norm_layer = partial(nn.LayerNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + 
is_neox_style=True, + ) self.blocks = nn.ModuleList( [ @@ -416,9 +424,19 @@ def rot_pos_emb(self, grid_thw): pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + return cos_combined, sin_combined def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: num_grid_per_side = self.num_grid_per_side @@ -508,7 +526,7 @@ def forward( if self.apply_vit_abs_pos_embed: pos_embeds = self.fast_pos_embed_interpolate(grid_thw) hidden_states = hidden_states + pos_embeds - rotary_pos_emb = self.rot_pos_emb(grid_thw) + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] @@ -519,7 +537,8 @@ def forward( cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) hidden_states = hidden_states.unsqueeze(1) - rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device) + rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device) max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) hidden_states_list = [] @@ -529,7 +548,8 @@ def forward( hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 7f0c9372991d..99a4007ef7f2 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -63,6 +63,7 @@ ) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -95,7 +96,6 @@ ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, - Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, @@ -232,14 +232,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -339,7 +341,13 @@ def __init__( norm_layer = partial(nn.LayerNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + 
self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) self.merger = Qwen3_VisionPatchMerger( d_model=vision_config.out_hidden_size, @@ -452,9 +460,19 @@ def rot_pos_emb(self, grid_thw: list[list[int]]): for t, h, w in grid_thw ] pos_ids = torch.cat(pos_ids, dim=0) - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + return cos_combined, sin_combined def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: num_grid_per_side = self.num_grid_per_side @@ -547,8 +565,13 @@ def forward( pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds - rotary_pos_emb = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True) + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) + rotary_pos_emb_cos = rotary_pos_emb_cos.to( + hidden_states.device, non_blocking=True + ) + rotary_pos_emb_sin = rotary_pos_emb_sin.to( + hidden_states.device, non_blocking=True + ) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] @@ -564,7 +587,8 @@ def forward( hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) From 184b12fdc6dce87485e3bd793e13e90421f93924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 18 Nov 2025 15:07:50 +0100 Subject: [PATCH 153/578] [Bugfix][NIXL] Fix `block_size_ratio` when logical !=physical blocks (#28925) Signed-off-by: NickLucche Co-authored-by: Cyrus Leung --- .../kv_connector/v1/nixl_connector.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index a70c98b63713..5ff95876ef34 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -677,12 +677,13 @@ class TpKVTopology: mapping between local and remote TP workers. 
""" - tp_size: int tp_rank: int remote_tp_size: dict[EngineId, int] is_mla: bool total_num_kv_heads: int attn_backend: type[AttentionBackend] + engine_id: EngineId + remote_block_size: dict[EngineId, int] def __post_init__(self): # Figure out whether the first dimension of the cache is K/V @@ -710,8 +711,13 @@ def split_k_and_v(self) -> bool: self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first ) - block_size: int - remote_block_size: dict[EngineId, int] + @property + def tp_size(self) -> int: + return self.remote_tp_size[self.engine_id] + + @property + def block_size(self) -> int: + return self.remote_block_size[self.engine_id] def tp_ratio( self, @@ -957,13 +963,12 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.xfer_stats = NixlKVConnectorStats() self.kv_topo = self.TpKVTopology( - tp_size=self.world_size, tp_rank=self.tp_rank, + engine_id=self.engine_id, remote_tp_size=self._tp_size, # shared state + remote_block_size=self._block_size, # shared state is_mla=self.use_mla, total_num_kv_heads=self.model_config.get_total_num_kv_heads(), - block_size=self.block_size, - remote_block_size=self._block_size, attn_backend=backend, ) self._use_pallas = self.kv_topo._use_pallas @@ -1185,6 +1190,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.block_size // kernel_block_size ) self.block_size = kernel_block_size + self._block_size[self.engine_id] = kernel_block_size seen_base_addresses.append(base_addr) curr_tensor_size_bytes = cache.numel() * cache.element_size() From f6aa122698790fb1a544e8d80ba97c49f02be945 Mon Sep 17 00:00:00 2001 From: Alex <30671301+killershrimp@users.noreply.github.com> Date: Tue, 18 Nov 2025 08:21:48 -0600 Subject: [PATCH 154/578] [CI Sprint] Quantization CI Cleanup (#24130) Signed-off-by: Alex Yun --- tests/quantization/test_compressed_tensors.py | 4 ++-- tests/quantization/test_cpu_offload.py | 16 ++++++++-------- tests/quantization/test_experts_int8.py | 6 ++++-- tests/quantization/test_fp8.py | 13 ++++++++----- tests/quantization/test_ipex_quant.py | 4 ++-- tests/quantization/test_lm_head.py | 2 +- tests/quantization/test_modelopt.py | 2 +- tests/quantization/test_ptpc_fp8.py | 3 ++- .../test_register_quantization_config.py | 6 +++--- tests/quantization/test_torchao.py | 2 +- 10 files changed, 32 insertions(+), 26 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e7d902ed26aa..31b65189b5ec 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -141,7 +141,7 @@ def zp_valid(zp: torch.Tensor | None): "neuralmagic/Llama-3.2-1B-quantized.w8a8", ], ) -@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.parametrize( "use_aiter", [True, False] if current_platform.is_rocm() else [False] @@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs( example_prompts, max_tokens, num_logprobs ) - with vllm_runner(model_path, dtype=dtype) as vllm_model: + with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs ) diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index a3fb4a695347..1591ce1c4f5a 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -19,8 +19,8 @@ def test_cpu_offload_fp8(): # Test 
loading a quantized checkpoint compare_two_settings( "neuralmagic/Qwen2-1.5B-Instruct-FP8", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch): # Test GPTQ Marlin compare_two_settings( "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch): # Test AWQ Marlin compare_two_settings( "Qwen/Qwen2-1.5B-Instruct-AWQ", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch): # Test wNa16 compare_two_settings( "nm-testing/tinyllama-oneshot-w4a16-channel-v2", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 2a72f734e431..b992e976ac30 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -21,7 +21,7 @@ ) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("max_tokens", [4]) def test_model_experts_int8_startup( hf_runner, vllm_runner, @@ -33,5 +33,7 @@ def test_model_experts_int8_startup( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_transformers_version(on_fail="skip") - with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: + with vllm_runner( + model, dtype=dtype, enforce_eager=True, quantization="experts_int8" + ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index f02da2996ffe..7bcac9ad768e 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -45,10 +45,10 @@ def test_model_load_and_run( if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") - with vllm_runner(model_id) as llm: + with vllm_runner(model_id, enforce_eager=True) as llm: # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy - outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) + outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(outputs[0][1]) @@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run( # `LLM.apply_model` requires pickling a function. 
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: + with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm: def check_model(model): attn = model.model.layers[0].self_attn.attn @@ -112,7 +112,7 @@ def check_model(model): # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy - outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) + outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(outputs[0][1]) @@ -142,7 +142,10 @@ def test_load_fp16_model( monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") with vllm_runner( - "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype + "facebook/opt-125m", + quantization="fp8", + enforce_eager=True, + kv_cache_dtype=kv_cache_dtype, ) as llm: def check_model(model): diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index ae9b1df3377d..4f3c52df6c28 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -26,7 +26,7 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", DTYPE) def test_ipex_quant(vllm_runner, model, dtype): - with vllm_runner(model, dtype=dtype) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=4) assert output print(output) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index f009a4cfb870..d92dfaa2cc7b 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -49,4 +49,4 @@ def check_model(model): vllm_model.apply_model(check_model) - print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1]) + print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1]) diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index 8abf65d29784..0298994c396f 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -88,6 +88,6 @@ def check_model(model): llm.apply_model(check_model) # Run a simple generation test to ensure the model works - output = llm.generate_greedy(["Hello my name is"], max_tokens=20) + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) assert output print(f"ModelOpt FP8 output: {output}") diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index e8ea4148585b..61efd2ce66c7 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: "facebook/opt-125m", dtype=dtype, quantization="ptpc_fp8", + enforce_eager=True, kv_cache_dtype=kv_cache_dtype, ) except AssertionError as e: @@ -65,5 +66,5 @@ def check_model(model): llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=20) + output = llm.generate_greedy("Hello my name is", max_tokens=4) assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 8da048703df9..a09856c78559 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -23,8 +23,8 @@ get_quantization_config, register_quantization_config, 
) -from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 - QuantizationConfig, +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, # noqa: E501 ) @@ -142,5 +142,5 @@ def check_model(model): llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=20) + output = llm.generate_greedy("Hello my name is", max_tokens=1) assert output diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 82413f36e997..fb8d6130c377 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -392,7 +392,7 @@ def get_weight_attrs(model): assert not has_int4_preshuffled_tensor assert weight_attrs == [False, 1, 0, True] - output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + output = llm.generate_greedy(["The capital of France is"], max_tokens=4) assert output From 49a986ecd445db2220e750b61ba653658ea3db9b Mon Sep 17 00:00:00 2001 From: Ido Segev Date: Tue, 18 Nov 2025 18:38:22 +0200 Subject: [PATCH 155/578] [Benchmark] multi_turn: Report warmup-inclusive runtime (#28937) Signed-off-by: Ido Segev --- benchmarks/multi_turn/README.md | 4 ++ .../benchmark_serving_multi_turn.py | 59 +++++++++++++++---- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md index f5b5c6c97d48..b0be1e3a69a6 100644 --- a/benchmarks/multi_turn/README.md +++ b/benchmarks/multi_turn/README.md @@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 ---------------------------------------------------------------------------------------------------- ``` +If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec` +and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the +benchmark-only runtime so the reported throughput stays comparable). + ### JSON configuration file for synthetic conversations generation The input flag `--input-file` is used to determine the input conversations for the benchmark.
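The new summary keys documented above come from timing the warmup phase separately from the measured phase, so throughput stays benchmark-only while the warmup-inclusive total is still reported. As a rough standalone sketch of that reporting pattern (simplified names and a sleep-based stand-in for request handling — not the benchmark's actual API, which follows in the diff below):

```python
import time


def run_phase(num_requests: int, per_request_sec: float) -> float:
    """Stand-in for issuing requests; returns the phase runtime in seconds."""
    start_ns = time.perf_counter_ns()
    time.sleep(num_requests * per_request_sec)
    return (time.perf_counter_ns() - start_ns) / 1e9


def summarize(num_requests: int, warmup_requests: int) -> dict[str, float]:
    # Warmup is timed on its own so it never inflates the reported throughput.
    warmup_runtime_sec = run_phase(warmup_requests, per_request_sec=0.001)
    runtime_sec = run_phase(num_requests, per_request_sec=0.001)
    return {
        "runtime_sec": runtime_sec,                      # benchmark-only
        "requests_per_sec": num_requests / runtime_sec,  # stays comparable
        "warmup_runtime_sec": warmup_runtime_sec,
        "total_runtime_incl_warmup_sec": runtime_sec + warmup_runtime_sec,
    }


if __name__ == "__main__":
    print(summarize(num_requests=100, warmup_requests=10))
```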
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 772d685ad90f..e23f6b923f1b 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1076,6 +1076,7 @@ def process_statistics( verbose: bool, gen_conv_args: GenConvArgs | None = None, excel_output: bool = False, + warmup_runtime_sec: float | None = None, ) -> None: if len(client_metrics) == 0: logger.info("No samples to process") @@ -1169,8 +1170,13 @@ def process_statistics( # Convert milliseconds to seconds runtime_sec = runtime_sec / 1000.0 requests_per_sec = float(len(df)) / runtime_sec - - params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + params = { + "runtime_sec": runtime_sec, + "requests_per_sec": requests_per_sec, + } + if warmup_runtime_sec is not None: + params["warmup_runtime_sec"] = warmup_runtime_sec + params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec # Generate a summary of relevant metrics (and drop irrelevant data) df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() @@ -1552,6 +1558,8 @@ async def main() -> None: url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop ) + warmup_runtime_sec: float | None = None + # Warm-up step if args.warmup_step: # Only send a single user prompt from every conversation. @@ -1566,26 +1574,56 @@ async def main() -> None: # all clients should finish their work before exiting warmup_bench_args = bench_args._replace(early_stop=False) - logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET) + warmup_start_ns = time.perf_counter_ns() conversations, _ = await main_mp( warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations ) - logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns) + logger.info( + "%sWarmup runtime: %.3f sec (%.3f ms)%s", + Color.PURPLE, + warmup_runtime_sec, + warmup_runtime_sec * 1000, + Color.RESET, + ) + logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET) # Run the benchmark - start_time = time.perf_counter_ns() + benchmark_start_ns = time.perf_counter_ns() client_convs, client_metrics = await main_mp( client_args, req_args, bench_args, tokenizer, conversations ) - total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns) # Calculate requests per second - total_runtime_sec = total_runtime_ms / 1000.0 - rps = len(client_metrics) / total_runtime_sec + requests_per_sec = len(client_metrics) / benchmark_runtime_sec + benchmark_runtime_ms = benchmark_runtime_sec * 1000.0 logger.info( - f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" - f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), " + "requests per second: %.3f%s", + Color.GREEN, + benchmark_runtime_sec, + benchmark_runtime_ms, + requests_per_sec, + Color.RESET, ) + if warmup_runtime_sec is not None: + total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec + logger.info( + "%sWarmup runtime: %.3f sec (%.3f ms)%s", + Color.GREEN, + warmup_runtime_sec, + warmup_runtime_sec * 1000, + Color.RESET, + ) + logger.info( + "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s", + 
Color.GREEN, + total_runtime_sec, + total_runtime_sec * 1000, + Color.RESET, + ) # Benchmark parameters params = { @@ -1610,6 +1648,7 @@ async def main() -> None: verbose=args.verbose, gen_conv_args=gen_conv_args, excel_output=args.excel_output, + warmup_runtime_sec=warmup_runtime_sec, ) if args.output_file is not None: From c2612371ad76a966cbbc443da3f3f91a4f4a3138 Mon Sep 17 00:00:00 2001 From: Luciano Martins Date: Tue, 18 Nov 2025 13:56:29 -0300 Subject: [PATCH 156/578] [Model] Add Gemma3 GGUF multimodal support (#27772) Signed-off-by: Luciano Martins Signed-off-by: Isotr0py Co-authored-by: Luciano Martins Co-authored-by: Isotr0py --- requirements/common.txt | 2 +- .../generation/test_multimodal_gguf.py | 115 +++++++++++ tests/models/quantization/test_gguf.py | 9 +- vllm/config/model.py | 20 +- .../layers/quantization/gguf.py | 67 ++++++- .../model_loader/gguf_loader.py | 186 ++++++++++++++++-- .../model_loader/weight_utils.py | 10 +- vllm/model_executor/models/gemma3_mm.py | 172 ++++++++++------ vllm/model_executor/models/siglip.py | 27 +++ vllm/transformers_utils/config.py | 11 ++ vllm/transformers_utils/gguf_utils.py | 166 ++++++++++++++++ vllm/transformers_utils/processor.py | 31 ++- vllm/transformers_utils/utils.py | 1 + vllm/v1/worker/gpu_model_runner.py | 19 ++ 14 files changed, 751 insertions(+), 85 deletions(-) create mode 100644 tests/models/multimodal/generation/test_multimodal_gguf.py create mode 100644 vllm/transformers_utils/gguf_utils.py diff --git a/requirements/common.txt b/requirements/common.txt index ad92ba3ad827..1058ab91a02a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -30,7 +30,7 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31 partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec -gguf >= 0.13.0 +gguf >= 0.17.0 mistral_common[image] >= 1.8.5 opencv-python-headless >= 4.11.0 # required for video IO pyyaml diff --git a/tests/models/multimodal/generation/test_multimodal_gguf.py b/tests/models/multimodal/generation/test_multimodal_gguf.py new file mode 100644 index 000000000000..e596b20c6302 --- /dev/null +++ b/tests/models/multimodal/generation/test_multimodal_gguf.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Literal, NamedTuple + +import pytest +from huggingface_hub import hf_hub_download +from pytest import MarkDecorator + +from tests.quantization.utils import is_quant_method_supported +from vllm.assets.image import ImageAsset +from vllm.utils.torch_utils import set_default_torch_num_threads + +from ....conftest import PromptImageInput, VllmRunner +from ...utils import check_logprobs_close + + +class GGUFMMTestConfig(NamedTuple): + original_model: str + gguf_repo: str + gguf_backbone: str + gguf_mmproj: str + prompt: list[str] + mm_data: dict[Literal["images"], PromptImageInput] + max_model_len: int = 4096 + marks: list[MarkDecorator] = [] + + @property + def gguf_model(self): + hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj) + return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone) + + +GEMMA3_CONFIG = GGUFMMTestConfig( + original_model="google/gemma-3-4b-it", + gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf", + gguf_backbone="gemma-3-4b-it-q4_0.gguf", + gguf_mmproj="mmproj-model-f16-4B.gguf", + prompt=["Describe this image in detail:"], + mm_data={"images": [ImageAsset("stop_sign").pil_image]}, + marks=[pytest.mark.core_model], +) + 
+MODELS_TO_TEST = [GEMMA3_CONFIG] + + +def run_multimodal_gguf_test( + vllm_runner: type[VllmRunner], + model: GGUFMMTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, +): + # Run gguf model. + with ( + set_default_torch_num_threads(1), + vllm_runner( + model_name=model.gguf_model, + enforce_eager=True, + tokenizer_name=model.original_model, + dtype=dtype, + max_model_len=model.max_model_len, + ) as gguf_model, + ): + gguf_outputs = gguf_model.generate_greedy_logprobs( + prompts=model.prompt, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + **model.mm_data, + ) + + # Run unquantized model. + with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=model.max_model_len, + ) as original_model: + original_outputs = original_model.generate_greedy_logprobs( + prompts=model.prompt, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + **model.mm_data, + ) + + check_logprobs_close( + outputs_0_lst=original_outputs, + outputs_1_lst=gguf_outputs, + name_0="original", + name_1="gguf", + ) + + +@pytest.mark.skipif( + not is_quant_method_supported("gguf"), + reason="gguf is not supported on this GPU type.", +) +@pytest.mark.parametrize( + "model", + [ + pytest.param(test_config, marks=test_config.marks) + for test_config in MODELS_TO_TEST + ], +) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_models( + vllm_runner: type[VllmRunner], + model: GGUFMMTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + run_multimodal_gguf_test(vllm_runner, model, dtype, max_tokens, num_logprobs) diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 5e2438857aee..3b9597507ac1 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -78,6 +78,12 @@ def gguf_model(self): gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", ) +GEMMA3_CONFIG = GGUFTestConfig( + original_model="google/gemma-3-270m-it", + gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF", + gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf", +) + MODELS = [ # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458 QWEN2_CONFIG, @@ -85,6 +91,7 @@ def gguf_model(self): GPT2_CONFIG, STABLELM_CONFIG, DOLPHIN_CONFIG, + GEMMA3_CONFIG, # STARCODER_CONFIG, # broken ] @@ -148,7 +155,7 @@ def check_model_outputs( "model", [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS], ) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("tp_size", [1]) diff --git a/vllm/config/model.py b/vllm/config/model.py index b3a28af6de38..49fe0bcd9a2a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -33,10 +33,14 @@ try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, + uses_custom_attention_masks, uses_mrope, ) +from vllm.transformers_utils.gguf_utils import ( + maybe_patch_hf_config_from_gguf, +) from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri -from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.transformers_utils.utils import check_gguf_file, maybe_model_redirect from vllm.utils.import_utils import LazyLoader from vllm.utils.torch_utils import common_broadcastable_dtype @@ -450,6 +454,12 @@ def 
__post_init__( self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: + if check_gguf_file(self.model): + raise ValueError( + "Using a tokenizer is mandatory when loading a GGUF model. " + "Please specify the tokenizer path or name using the " + "--tokenizer argument." + ) self.tokenizer = self.model if self.tokenizer_revision is None: self.tokenizer_revision = self.revision @@ -508,6 +518,10 @@ def __post_init__( hf_overrides_kw=hf_overrides_kw, hf_overrides_fn=hf_overrides_fn, ) + hf_config = maybe_patch_hf_config_from_gguf( + self.model, + hf_config, + ) self.hf_config = hf_config if dict_overrides: @@ -1605,6 +1619,10 @@ def uses_alibi(self) -> bool: def uses_mrope(self) -> bool: return uses_mrope(self.hf_config) + @property + def uses_custom_attention_masks(self) -> bool: + return uses_custom_attention_masks(self.hf_config) + @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index caabcd0ca0ee..42d7a67371ae 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable +from collections.abc import Callable, Mapping +from types import MappingProxyType from typing import Any, Optional import gguf @@ -26,7 +27,11 @@ QuantizationConfig, QuantizeMethodBase, ) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + UnquantizedEmbeddingMethod, + VocabParallelEmbedding, +) +from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.utils.torch_utils import direct_register_custom_op @@ -65,18 +70,70 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): - if is_layer_skipped_gguf(prefix, self.unquantized_modules): + if is_layer_skipped_gguf( + prefix, self.unquantized_modules, self.packed_modules_mapping + ): return UnquantizedLinearMethod() return GGUFLinearMethod(self) elif isinstance(layer, VocabParallelEmbedding): + if is_layer_skipped_gguf( + prefix, self.unquantized_modules, self.packed_modules_mapping + ): + return UnquantizedEmbeddingMethod() return GGUFEmbeddingMethod(self) elif isinstance(layer, FusedMoE): return GGUFMoEMethod(self, layer.moe_config) return None + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + if self.unquantized_modules is not None: + self.unquantized_modules = hf_to_vllm_mapper.apply_list( + self.unquantized_modules + ) + + +def is_layer_skipped_gguf( + prefix: str, + unquantized_modules: list[str], + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}), +): + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. 
+ proj_name = prefix.split(".")[-1] + if proj_name in fused_mapping: + shard_prefixes = [ + prefix.replace(proj_name, shard_proj_name) + for shard_proj_name in fused_mapping[proj_name] + ] + + is_skipped = None + for shard_prefix in shard_prefixes: + is_shard_skipped = any( + shard_prefix in module_name for module_name in unquantized_modules + ) + + if is_skipped is None: + is_skipped = is_shard_skipped + elif is_shard_skipped != is_skipped: + raise ValueError( + f"Detected some but not all shards of {prefix} " + "are quantized. All shards of fused layers " + "to have the same precision." + ) + else: + is_skipped = any(module_name in prefix for module_name in unquantized_modules) -def is_layer_skipped_gguf(prefix: str, unquantized_modules: list[str]): - return any(module_name in prefix for module_name in unquantized_modules) + assert is_skipped is not None + return is_skipped UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 7db1fc167c4f..2416836be03c 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -7,10 +7,11 @@ import torch import torch.nn as nn from huggingface_hub import hf_hub_download -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoModelForImageTextToText from vllm.config import ModelConfig, VllmConfig from vllm.config.load import LoadConfig +from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.utils import ( initialize_model, @@ -21,8 +22,11 @@ get_gguf_weight_type_map, gguf_quant_weights_iterator, ) +from vllm.transformers_utils.gguf_utils import detect_gguf_multimodal from vllm.utils.torch_utils import set_default_torch_dtype +logger = init_logger(__name__) + class GGUFModelLoader(BaseModelLoader): """ @@ -67,7 +71,15 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details. """ config = model_config.hf_config + # Get text config to handle both nested (multimodal) and flat + # (text-only) config structures. For multimodal models like + # Gemma3Config, this returns config.text_config. For text-only + # models, this returns config itself. 
+ text_config = config.get_text_config() model_type = config.model_type + is_multimodal = ( + hasattr(config, "vision_config") and config.vision_config is not None + ) gguf_to_hf_name_map = {} # hack: ggufs have a different name than transformers if model_type == "cohere": @@ -115,24 +127,167 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): break if arch is None: raise RuntimeError(f"Unknown gguf model_type: {model_type}") - num_layers = config.num_hidden_layers - name_map = gguf.get_tensor_name_map(arch, num_layers) + text_num_layers = text_config.num_hidden_layers + text_name_map = gguf.get_tensor_name_map(arch, text_num_layers) + + if is_multimodal: + mm_proj_arch = gguf.MODEL_ARCH.MMPROJ + vision_num_layers = config.vision_config.num_hidden_layers + vision_name_map = gguf.get_tensor_name_map(mm_proj_arch, vision_num_layers) + else: + vision_name_map = None + + # Create dummy model to extract parameter names + # For multimodal: use AutoModelForImageTextToText to get + # language + vision + projector params + # For text-only: use AutoModelForCausalLM to get language model params + auto_cls = ( + AutoModelForImageTextToText if is_multimodal else AutoModelForCausalLM + ) with torch.device("meta"): - dummy_model = AutoModelForCausalLM.from_config( + dummy_model = auto_cls.from_config( config, trust_remote_code=model_config.trust_remote_code ) + state_dict = dummy_model.state_dict() + if hf_checkpoint_map := getattr( + dummy_model, "_checkpoint_conversion_mapping", None + ): + + def revert_hf_rename(name: str) -> str: + for original_name, hf_name in hf_checkpoint_map.items(): + if hf_name in name: + name = name.replace(hf_name, original_name).lstrip("^") + return name + + state_dict = { + revert_hf_rename(name): tensor for name, tensor in state_dict.items() + } + + def find_hf_name_in_tensor_map(hf_name: str) -> str | None: + """ + Map HuggingFace parameter name to GGUF tensor name. + + This function handles the mismatch between HF parameter naming + conventions and gguf-py's expected format: + 1. Strips 'model.' prefix (common in multimodal models) + 2. Converts '_weight' suffix to '.weight' (Gemma3 compatibility) + 3. Searches vision_name_map for multimodal parameters + 4. Falls back to text_name_map for language model parameters + + Args: + hf_name: Full HuggingFace parameter name (e.g., + 'model.multi_modal_projector.mm_soft_emb_norm.weight') + + Returns: + GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight') + or None if no mapping found + """ + # Strip 'language_model.' prefix for multimodal models - gguf-py + # tensor mappings expect parameter names without this prefix. + # Note: 'model.' prefix should be KEPT for text-only models as + # gguf-py expects it. + if hf_name.startswith("language_model."): + hf_name = hf_name[15:] # Remove 'language_model.' 
+ + # Parse parameter name and suffix + if hf_name.endswith((".weight", ".bias")): + base_name, suffix = hf_name.rsplit(".", 1) + else: + base_name, suffix = hf_name, "" + # Handle '_weight' suffix (Gemma3 naming: parameter ends with + # '_weight' instead of '.weight') + if base_name.endswith("_weight"): + base_name = base_name[:-7] # Remove '_weight' + suffix = "weight" + + gguf_name = None + # Priority 1: Search vision/projector parameters for multimodal models + if vision_name_map is not None: + gguf_name = vision_name_map.get_name(base_name) + + # Priority 2: Search text backbone parameters + if gguf_name is None: + gguf_name = text_name_map.get_name(base_name) + + if gguf_name is None: + return None + return gguf_name + "." + suffix + + # Build mapping and track unmapped parameters + unmapped_params = [] for hf_name in state_dict: - name, suffix = hf_name.rsplit(".", 1) - gguf_name = name_map.get_name(name) - gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name + gguf_name_with_suffix = find_hf_name_in_tensor_map(hf_name) + + # Track mapping success + if gguf_name_with_suffix is not None: + gguf_to_hf_name_map[gguf_name_with_suffix] = hf_name + logger.debug("Mapped GGUF %s → HF %s", gguf_name_with_suffix, hf_name) + elif hf_name not in gguf_to_hf_name_map.values(): + # Parameter not in manual overrides either + unmapped_params.append(hf_name) + + # All parameters must be mapped: both vision/projector and backbone + if unmapped_params: + raise RuntimeError( + f"Failed to map GGUF parameters " + f"({len(unmapped_params)}): " + f"{unmapped_params}" + ) return gguf_to_hf_name_map + def _get_gguf_weight_type( + self, + model_config: ModelConfig, + model_name_or_path: str, + gguf_to_hf_name_map: dict[str, str], + ) -> dict[str, str]: + weight_type_map = get_gguf_weight_type_map( + model_config.model, gguf_to_hf_name_map + ) + is_multimodal = hasattr(model_config.hf_config, "vision_config") + if is_multimodal: + mmproj_file = detect_gguf_multimodal(model_name_or_path) + assert mmproj_file is not None, ( + "Could not find mm_proj file for multimodal GGUF model" + ) + logger.info("Loading extra mm_proj weights from %s...", mmproj_file) + mm_proj_weight_type_map = get_gguf_weight_type_map( + mmproj_file, gguf_to_hf_name_map + ) + weight_type_map.update(mm_proj_weight_type_map) + return weight_type_map + def _get_weights_iterator( - self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] + self, + model_config: ModelConfig, + model_name_or_path: str, + gguf_to_hf_name_map: dict[str, str], ) -> Generator[tuple[str, torch.Tensor], None, None]: - return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) + """ + Iterate over GGUF model weights, loading from both main model file and + mmproj.gguf for multimodal Gemma3 models. 
+ + For Gemma3 multimodal GGUF models: + - Main file (gemma-3-*.gguf): Language model weights (model.*) + - mmproj file (mmproj*.gguf): Vision tower + projector weights (v.*, mm.*) + + Yields: + Tuples of (parameter_name, tensor) for all model weights + """ + hf_config = model_config.hf_config + is_multimodal = hasattr(hf_config, "vision_config") + + if is_multimodal: + # Load mm_proj (mm_encoder + projector) for multimodal weights + mmproj_file = detect_gguf_multimodal(model_name_or_path) + assert mmproj_file is not None, ( + "Could not find mm_proj file for multimodal GGUF model" + ) + yield from gguf_quant_weights_iterator(mmproj_file, gguf_to_hf_name_map) + + yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) @@ -141,7 +296,7 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: local_model_path = self._prepare_weights(model_config.model) gguf_weights_map = self._get_gguf_weights_map(model_config) model.load_weights( - self._get_weights_iterator(local_model_path, gguf_weights_map) + self._get_weights_iterator(model_config, local_model_path, gguf_weights_map) ) def load_model( @@ -156,14 +311,19 @@ def load_model( ): model_config.hf_config.update({"tie_word_embeddings": True}) - weight_type_map = get_gguf_weight_type_map(model_config.model, gguf_weights_map) - + weight_type_map = self._get_gguf_weight_type( + model_config, local_model_path, gguf_weights_map + ) # filter out unquantized modules to skip unquant_names = [ name.removesuffix(".weight") for name, weight_type in weight_type_map.items() - if weight_type == "F32" and name.endswith(".weight") + if weight_type in ("F32", "F16", "BF16") and name.endswith(".weight") ] + logger.debug( + "GGUF unquantized modules: %s", + unquant_names, + ) vllm_config.quant_config.unquantized_modules.extend(unquant_names) target_device = torch.device(device_config.device) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 93986e5f2fc0..89634cbf4124 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -836,7 +836,11 @@ def gguf_quant_weights_iterator( ) -> Generator[tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert - them to torch tensors + them to torch tensors. + Be careful of the order of yielding weight types and weights data, + we have to yield all weight types first before yielding any weights. + Otherwise it would cause issue when loading weights with for packed + layer with different quant types. 
""" reader = gguf.GGUFReader(gguf_file) @@ -846,7 +850,7 @@ def gguf_quant_weights_iterator( weight_type = tensor.tensor_type name = gguf_to_hf_name_map[tensor.name] - if weight_type.name != "F32": + if weight_type.name not in ("F32", "BF16", "F16"): weight_type_name = name.replace("weight", "qweight_type") weight_type = torch.tensor(weight_type) yield weight_type_name, weight_type @@ -856,7 +860,7 @@ def gguf_quant_weights_iterator( weight = tensor.data weight_type = tensor.tensor_type name = gguf_to_hf_name_map[tensor.name] - if weight_type.name != "F32": + if weight_type.name not in ("F32", "BF16", "F16"): name = name.replace("weight", "qweight") param = torch.tensor(weight) yield name, param diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 8e2bbe8f7990..fe83c8b63b01 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Any, Literal, TypeAlias +from typing import Annotated, Any, Literal import torch from torch import nn @@ -20,12 +20,7 @@ MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import ( - ImageEmbeddingItems, - ImageProcessorItems, - ImageSize, - MultiModalDataItems, -) +from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems from vllm.multimodal.processing import ( BaseMultiModalProcessor, BaseProcessingInfo, @@ -76,15 +71,7 @@ class Gemma3ImagePixelInputs(TensorSchema): num_patches: Annotated[torch.Tensor, TensorShape("bn")] -class Gemma3ImageEmbeddingInputs(TensorSchema): - type: Literal["image_embeds"] = "image_embeds" - image_embeds: Annotated[ - torch.Tensor, - TensorShape("ni", "nf", "hs"), - ] - - -Gemma3ImageInputs: TypeAlias = Gemma3ImagePixelInputs | Gemma3ImageEmbeddingInputs +Gemma3ImageInputs = Gemma3ImagePixelInputs class Gemma3ProcessingInfo(BaseProcessingInfo): @@ -191,9 +178,8 @@ def get_num_crops( def get_image_repl( self, *, - image_width: int | None, - image_height: int | None, - num_crops: int | None = None, + image_width: int, + image_height: int, processor: Gemma3Processor | None, ) -> PromptUpdateDetails[str]: if processor is None: @@ -201,13 +187,11 @@ def get_image_repl( boi_token = processor.boi_token - if num_crops is None: - assert image_width is not None and image_height is not None - num_crops = self.get_num_crops( - image_width=image_width, - image_height=image_height, - processor=processor, - ) + num_crops = self.get_num_crops( + image_width=image_width, + image_height=image_height, + processor=processor, + ) if num_crops == 0: image_text = boi_token @@ -337,7 +321,6 @@ def _get_mm_fields_config( return dict( pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches), num_patches=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), ) def _get_prompt_updates( @@ -350,19 +333,7 @@ def _get_prompt_updates( image_token = hf_processor.boi_token def get_replacement_gemma3(item_idx: int): - images = mm_items.get_items( - "image", (ImageEmbeddingItems, ImageProcessorItems) - ) - - if isinstance(images, ImageEmbeddingItems): - # For image embedding inputs, only support no crops cases - # since it's not supported in hf processor anyway - return self.info.get_image_repl( - image_width=None, - image_height=None, - num_crops=0, - processor=hf_processor, - ) + images = 
mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) return self.info.get_image_repl( @@ -586,19 +557,17 @@ def _parse_and_validate_image_input( pixel_values = kwargs.pop("pixel_values", None) num_patches = kwargs.pop("num_patches", None) image_embeds = kwargs.pop("image_embeds", None) + assert image_embeds is None, "Gemma3 does not support image_embeds." + if pixel_values is None: + return None - if pixel_values is not None: - image_size = self.config.vision_config.image_size - return Gemma3ImagePixelInputs( - pixel_values=pixel_values, - num_patches=num_patches, - resolve_bindings={"h": image_size, "w": image_size}, - ) - elif image_embeds is not None: - return Gemma3ImageEmbeddingInputs( - image_embeds=image_embeds, - type="image_embeds", - ) + image_size = self.config.vision_config.image_size + + return Gemma3ImagePixelInputs( + pixel_values=pixel_values, + num_patches=num_patches, + resolve_bindings={"h": image_size, "w": image_size}, + ) def _image_pixels_to_features( self, @@ -610,9 +579,7 @@ def _image_pixels_to_features( def _process_image_input( self, image_input: Gemma3ImageInputs, - ) -> torch.Tensor | list[torch.Tensor]: - if image_input["type"] == "image_embeds": - return image_input["image_embeds"] + ) -> list[torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input["pixel_values"] @@ -629,13 +596,33 @@ def _process_image_input( def get_language_model(self) -> torch.nn.Module: return self.language_model - def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: + def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return [] return self._process_image_input(image_input) + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = True, + ) -> torch.Tensor: + # Early return for text-only inference (no multimodal data) + if multimodal_embeddings is None or is_multimodal is None: + return super().embed_input_ids(input_ids) + + # Use interface default with OOV handling enabled + return super().embed_input_ids( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, + ) + def forward( self, input_ids: torch.Tensor, @@ -657,6 +644,79 @@ def forward( return hidden_states + def generate_attention_masks( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + mask_dtype: torch.dtype, + ) -> dict[str, Any]: + """Generate custom attention masks for Gemma3 multimodal inputs. + + This is called by V1 engine's gpu_model_runner during preprocessing + to generate attention masks that allow bidirectional attention between + image tokens while maintaining causal attention for text. + """ + # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. + # This is a HACK. Fix this. 
+ start_indices = (positions == 0).cpu().nonzero() + num_seqs = len(start_indices) + seq_lens = [] + for i in range(num_seqs): + start_idx = start_indices[i] + end_idx = start_indices[i + 1] if i < num_seqs - 1 else len(input_ids) + seq_lens.append(end_idx - start_idx) + + global_attn_masks = [] + local_attn_masks = [] + start_idx = 0 + for seq_idx, seq_len in enumerate(seq_lens): + end_idx = start_idx + seq_len + input_token_ids = input_ids[start_idx:end_idx] + + # Find image token positions + img_pos = input_token_ids == self.config.image_token_index + + start_idx = end_idx + + # Create a global causal mask + global_attn_mask = torch.empty( + 1, + 1, + seq_len, + seq_len, + dtype=mask_dtype, + device=input_ids.device, + ) + global_attn_mask.fill_(float("-inf")) + # Fill the lower triangle with 0 (causal attention) + global_attn_mask = global_attn_mask.triu(diagonal=1) + + # Enable bidirectional attention between image tokens + img_mask = torch.zeros_like(global_attn_mask) + img_mask[:, :, :, img_pos] += 1 + img_mask[:, :, img_pos, :] += 1 + global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) + global_attn_masks.append(global_attn_mask) + + # GGUF compatibility: config might be Gemma3TextConfig directly + text_config = getattr(self.config, "text_config", self.config) + sliding_window = text_config.sliding_window + if sliding_window is not None: + # Create a local causal mask with sliding window (1024) + local_attn_mask = torch.ones_like(global_attn_mask) + local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) + local_attn_mask = torch.where( + local_attn_mask == 0, global_attn_mask, float("-inf") + ) + local_attn_masks.append(local_attn_mask) + + return { + "has_images": True, + "seq_lens": seq_lens, + "global_attn_masks": global_attn_masks, + "local_attn_masks": local_attn_masks, + } + def prepare_attn_masks( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index b175dd60cf65..42d906d089f9 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -827,6 +827,7 @@ def __init__( ) -> None: super().__init__() + self.quant_config = quant_config self.vision_model = SiglipVisionTransformer( config, quant_config, @@ -911,12 +912,38 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: break else: param = params_dict[name] + param = maybe_swap_ffn_param( + name, param, loaded_weight, params_dict, self.quant_config + ) weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params +def maybe_swap_ffn_param( + name: str, + param: torch.Tensor, + loaded_weight: torch.Tensor, + params_dict: dict[str, torch.Tensor], + quant_config: QuantizationConfig, +) -> torch.Tensor: + if not (quant_config and quant_config.get_name() == "gguf") or ".fc" not in name: + return param + # Some GGUF models have fc1 and fc2 weights swapped + tp_size = get_tensor_model_parallel_world_size() + output_dim = getattr(param, "output_dim", 0) + output_size = param.size(output_dim) * tp_size + weight_out_size = loaded_weight.size(output_dim) + if ".fc1." in name and output_size != weight_out_size: + new_name = name.replace(".fc1.", ".fc2.") + param = params_dict[new_name] + elif ".fc2." 
in name and output_size != weight_out_size: + new_name = name.replace(".fc2.", ".fc1.") + param = params_dict[new_name] + return param + + # Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200 class SiglipTextEmbeddings(nn.Module): def __init__(self, config: SiglipTextConfig): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 49250e071eab..ac4a71648cec 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -477,6 +477,17 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False +def uses_custom_attention_masks(config: PretrainedConfig) -> bool: + """Detect if model uses custom attention mask generation for multimodal. + + Some multimodal models require custom attention masks that enable + bidirectional attention between image tokens while maintaining causal + attention for text tokens. Currently applies to Gemma3 multimodal models. + """ + architectures = getattr(config, "architectures", []) + return "Gemma3ForConditionalGeneration" in architectures + + def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): """ Update kwargs for AutoConfig initialization based on model_type diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py new file mode 100644 index 000000000000..2bf59c91a3bb --- /dev/null +++ b/vllm/transformers_utils/gguf_utils.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""GGUF utility functions.""" + +from pathlib import Path + +import gguf +from gguf.constants import Keys, VisionProjectorType +from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def detect_gguf_multimodal(model: str) -> Path | None: + """Check if GGUF model has multimodal projector file. + + Args: + model: Model path string + + Returns: + Path to mmproj file if found, None otherwise + """ + if not model.endswith(".gguf"): + return None + + try: + model_path = Path(model) + if not model_path.is_file(): + return None + + model_dir = model_path.parent + mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"] + for pattern in mmproj_patterns: + mmproj_files = list(model_dir.glob(pattern)) + if mmproj_files: + return mmproj_files[0] + return None + except Exception: + return None + + +def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | None": + """Extract vision config parameters from mmproj.gguf metadata. + + Reads vision encoder configuration from GGUF metadata fields using + standardized GGUF constants. Automatically detects the projector type + (e.g., gemma3, llama4) and applies model-specific parameters accordingly. + + The function extracts standard CLIP vision parameters from GGUF metadata + and applies projector-type-specific customizations. For unknown projector + types, it uses safe defaults from SiglipVisionConfig. + + Args: + mmproj_path: Path to mmproj.gguf file (str or Path) + + Returns: + SiglipVisionConfig if extraction succeeds, None if any required + field is missing from the GGUF metadata + + Raises: + Exception: Exceptions from GGUF reading (file not found, corrupted + file, etc.) 
propagate directly from gguf.GGUFReader + """ + reader = gguf.GGUFReader(str(mmproj_path)) + + # Detect projector type to apply model-specific parameters + projector_type = None + projector_type_field = reader.get_field(Keys.Clip.PROJECTOR_TYPE) + if projector_type_field: + try: + projector_type = bytes(projector_type_field.parts[-1]).decode("utf-8") + except (AttributeError, UnicodeDecodeError) as e: + logger.warning("Failed to decode projector type from GGUF: %s", e) + + # Map GGUF field constants to SiglipVisionConfig parameters. + # Uses official GGUF constants from gguf-py for standardization. + # Format: {gguf_constant: (param_name, dtype)} + VISION_CONFIG_FIELDS = { + Keys.ClipVision.EMBEDDING_LENGTH: ("hidden_size", int), + Keys.ClipVision.FEED_FORWARD_LENGTH: ("intermediate_size", int), + Keys.ClipVision.BLOCK_COUNT: ("num_hidden_layers", int), + Keys.ClipVision.Attention.HEAD_COUNT: ("num_attention_heads", int), + Keys.ClipVision.IMAGE_SIZE: ("image_size", int), + Keys.ClipVision.PATCH_SIZE: ("patch_size", int), + Keys.ClipVision.Attention.LAYERNORM_EPS: ("layer_norm_eps", float), + } + + # Extract and validate all required fields + config_params = {} + for gguf_key, (param_name, dtype) in VISION_CONFIG_FIELDS.items(): + field = reader.get_field(gguf_key) + if field is None: + logger.warning( + "Missing required vision config field '%s' in mmproj.gguf", + gguf_key, + ) + return None + # Extract scalar value from GGUF field and convert to target type + config_params[param_name] = dtype(field.parts[-1]) + + # Apply model-specific parameters based on projector type + if projector_type == VisionProjectorType.GEMMA3: + # Gemma3 doesn't use the vision pooling head (multihead attention) + # This is a vLLM-specific parameter used in SiglipVisionTransformer + config_params["vision_use_head"] = False + logger.info("Detected Gemma3 projector, disabling vision pooling head") + # Add other projector-type-specific customizations here as needed + # elif projector_type == VisionProjectorType.LLAMA4: + # config_params["vision_use_head"] = ... + + # Create config with extracted parameters + # Note: num_channels and attention_dropout use SiglipVisionConfig defaults + # (3 and 0.0 respectively) which are correct for all models + config = SiglipVisionConfig(**config_params) + + if projector_type: + logger.info( + "Extracted vision config from mmproj.gguf (projector_type: %s)", + projector_type, + ) + else: + logger.info("Extracted vision config from mmproj.gguf metadata") + + return config + + +def maybe_patch_hf_config_from_gguf( + model: str, + hf_config: PretrainedConfig, +) -> PretrainedConfig: + """Patch HF config for GGUF models. + + Applies GGUF-specific patches to HuggingFace config: + 1. For multimodal models: patches architecture and vision config + 2. For all GGUF models: overrides vocab_size from embedding tensor + + This ensures compatibility with GGUF models that have extended + vocabularies (e.g., Unsloth) where the GGUF file contains more + tokens than the HuggingFace tokenizer config specifies. 
+ + Args: + model: Model path string + hf_config: HuggingFace config to patch in-place + + Returns: + Updated HuggingFace config + """ + # Patch multimodal config if mmproj.gguf exists + mmproj_path = detect_gguf_multimodal(model) + if mmproj_path is not None: + vision_config = extract_vision_config_from_gguf(str(mmproj_path)) + + # Create HF config for Gemma3 multimodal + text_config = hf_config.get_text_config() + is_gemma3 = hf_config.model_type in ("gemma3", "gemma3_text") + if vision_config is not None and is_gemma3: + new_hf_config = Gemma3Config.from_text_vision_configs( + text_config=text_config, + vision_config=vision_config, + architectures=["Gemma3ForConditionalGeneration"], + ) + hf_config = new_hf_config + + return hf_config diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index b3469c1b18f2..8deacb5b0791 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -18,7 +18,7 @@ from transformers.video_processing_utils import BaseVideoProcessor from typing_extensions import TypeVar -from vllm.transformers_utils.utils import convert_model_repo_to_path +from vllm.transformers_utils.utils import check_gguf_file, convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: @@ -236,9 +236,20 @@ def cached_processor_from_config( processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: + if check_gguf_file(model_config.model): + assert not check_gguf_file(model_config.tokenizer), ( + "For multimodal GGUF models, the original tokenizer " + "should be used to correctly load processor." + ) + model = model_config.tokenizer + revision = model_config.tokenizer_revision + else: + model = model_config.model + revision = model_config.revision + return cached_get_processor_without_dynamic_kwargs( - model_config.model, - revision=model_config.revision, + model, + revision=revision, trust_remote_code=model_config.trust_remote_code, processor_cls=processor_cls, # type: ignore[arg-type] **_merge_mm_kwargs(model_config, processor_cls, **kwargs), @@ -339,9 +350,19 @@ def cached_image_processor_from_config( model_config: "ModelConfig", **kwargs: Any, ): + if check_gguf_file(model_config.model): + assert not check_gguf_file(model_config.tokenizer), ( + "For multimodal GGUF models, the original tokenizer " + "should be used to correctly load image processor." 
+ ) + model = model_config.tokenizer + revision = model_config.tokenizer_revision + else: + model = model_config.model + revision = model_config.revision return cached_get_image_processor( - model_config.model, - revision=model_config.revision, + model, + revision=revision, trust_remote_code=model_config.trust_remote_code, **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs), ) diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 1ae42ba622dc..901a64d9d263 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -27,6 +27,7 @@ def is_cloud_storage(model_or_path: str) -> bool: return is_s3(model_or_path) or is_gcs(model_or_path) +@cache def check_gguf_file(model: str | PathLike) -> bool: """Check if the file is a GGUF model.""" model = Path(model) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0102ca4739ad..67f575f92cc6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -324,6 +324,7 @@ def __init__( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.uses_custom_attention_masks = model_config.uses_custom_attention_masks self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config ) @@ -2346,6 +2347,24 @@ def _preprocess( **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } + + # Generate custom attention masks for models that require them. + # V1 pre-generates embeddings, so forward() skips prepare_attn_masks(). + # Check mm_features (mm_embeds is empty during decode). + has_mm_features = any( + req_state.mm_features for req_state in self.requests.values() + ) + if ( + self.uses_custom_attention_masks + and has_mm_features + and hasattr(self.model, "generate_attention_masks") + ): + mask_kwargs = self.model.generate_attention_masks( + self.input_ids.gpu[:num_scheduled_tokens], + self.positions.gpu[:num_scheduled_tokens], + mask_dtype=self.model.dtype, + ) + model_kwargs.update(mask_kwargs) elif self.enable_prompt_embeds and is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. 
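Taken together, the loader changes in this patch mean a multimodal Gemma3 GGUF checkpoint is loaded from two files, the quantized backbone (`*.gguf`) and the vision projector (`mmproj*.gguf`), which must sit in the same directory, with the original HF tokenizer supplied explicitly. A minimal offline-inference sketch under those assumptions follows; the repo and filenames mirror the new test, while the image URL is a placeholder and exact engine arguments may differ from the CI configuration.

```python
from huggingface_hub import hf_hub_download

from vllm import LLM

# Both files must land in the same directory so the loader can discover the
# mmproj file next to the backbone GGUF (see detect_gguf_multimodal above).
repo = "google/gemma-3-4b-it-qat-q4_0-gguf"
hf_hub_download(repo, filename="mmproj-model-f16-4B.gguf")
backbone = hf_hub_download(repo, filename="gemma-3-4b-it-q4_0.gguf")

# A tokenizer is now mandatory for GGUF models; pointing it at the original HF
# repo also lets the processor and image processor be resolved from there.
llm = LLM(model=backbone, tokenizer="google/gemma-3-4b-it", max_model_len=4096)

conversation = [
    {
        "role": "user",
        "content": [
            # Placeholder URL; any reachable image works here.
            {"type": "image_url", "image_url": {"url": "https://example.com/stop_sign.jpg"}},
            {"type": "text", "text": "Describe this image in detail:"},
        ],
    }
]
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```

Omitting `tokenizer=` now raises an error for GGUF models, since the GGUF file alone does not carry the processor configuration needed for multimodal inputs.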
From f226a3f0c11aed72f585ebd2942d4a6832adbfb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 18 Nov 2025 18:22:30 +0100 Subject: [PATCH 157/578] [CI][NIXL] Change default `block_size` for tests (#28927) Signed-off-by: NickLucche --- tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index ebc8575e5b39..87c9a105e936 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -49,8 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} -PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16} -DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16} +PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-128} +DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) From da8dadf68b5a2af849e7c5fd35ce9b8525d8d398 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 18 Nov 2025 09:26:07 -0800 Subject: [PATCH 158/578] [Minor] Rename `ec_producer` field to `is_ec_producer` (#28884) Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index d49eb752d56a..3a25827cec38 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -180,7 +180,7 @@ def __init__( logger.info("Batch queue is enabled with size %d", self.batch_queue_size) self.batch_queue = deque(maxlen=self.batch_queue_size) - self.ec_producer = ( + self.is_ec_producer = ( vllm_config.ec_transfer_config is not None and vllm_config.ec_transfer_config.is_ec_producer ) @@ -390,7 +390,7 @@ def step_with_batch_queue( exec_future = self.model_executor.execute_model( scheduler_output, non_block=True ) - if not self.ec_producer: + if not self.is_ec_producer: model_executed = scheduler_output.total_num_scheduled_tokens > 0 if self.is_pooling_model or not model_executed: From 0af3d4f0df360decc2115f43f5e4bc732342e7e4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 19 Nov 2025 01:28:34 +0800 Subject: [PATCH 159/578] =?UTF-8?q?[FEAT]=20[AITER]=20[ROCm]=20integrate?= =?UTF-8?q?=20aiter=C2=A0sampling=C2=A0ops=20(#26084)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: vllmellm --- vllm/v1/sample/ops/topk_topp_sampler.py | 77 +++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 02ea658b7f20..c6c7e924175f 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -7,6 +7,7 @@ from packaging import version from vllm import envs +from vllm._aiter_ops import rocm_aiter_ops from vllm.config.model import LogprobsMode from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform @@ -55,6 +56,17 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: self.forward = self.forward_native else: self.forward = self.forward_cpu + elif ( + logprobs_mode not in ("processed_logits", "processed_logprobs") + and rocm_aiter_ops.is_enabled() + ): + import aiter.ops.sampling # noqa: F401 + + self.aiter_ops = 
torch.ops.aiter + logger.info_once( + "Using aiter sampler on ROCm (lazy import, sampling-only)." + ) + self.forward = self.forward_hip else: self.forward = self.forward_native @@ -138,6 +150,64 @@ def forward_cpu( return probs.div_(q).argmax(dim=-1).view(-1), logits_to_return + def forward_hip( + self, + logits: torch.Tensor, + generators: dict[int, torch.Generator], + k: torch.Tensor | None, + p: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """Optimized ROCm/aiter path (same structure as forward_cuda).""" + if (k is None and p is None) or generators: + if generators: + logger.warning_once( + "aiter sampler does not support per-request generators; " + "falling back to PyTorch-native." + ) + return self.forward_native(logits, generators, k, p) + assert self.logprobs_mode not in ( + "processed_logits", + "processed_logprobs", + ), "aiter sampler does not support returning logits/logprobs." + return self.aiter_sample(logits, k, p, generators), None + + def aiter_sample( + self, + logits: torch.Tensor, + k: torch.Tensor | None, + p: torch.Tensor | None, + generators: dict[int, torch.Generator], + ) -> torch.Tensor: + """Sample from logits using aiter ops.""" + use_top_k = k is not None + use_top_p = p is not None + # Joint k+p path + if use_top_p and use_top_k: + probs = logits.softmax(dim=-1, dtype=torch.float32).contiguous() + next_token_ids = self.aiter_ops.top_k_top_p_sampling_from_probs( + probs, + None, + *_to_tensor_scalar_tuple(k), + *_to_tensor_scalar_tuple(p), + deterministic=True, + ) + return next_token_ids.view(-1) + # Top-p only path + elif use_top_p: + probs = logits.softmax(dim=-1, dtype=torch.float32).contiguous() + next_token_ids = self.aiter_ops.top_p_sampling_from_probs( + probs, None, *_to_tensor_scalar_tuple(p), deterministic=True + ) + return next_token_ids.view(-1) + # Top-k only path + elif use_top_k: + probs = logits.softmax(dim=-1, dtype=torch.float32).contiguous() + renorm_probs = self.aiter_ops.top_k_renorm_probs( + probs, *_to_tensor_scalar_tuple(k) + ) + return torch.multinomial(renorm_probs, num_samples=1).view(-1) + raise RuntimeError("aiter_sample was called with no active top-k or top-p.") + # Note: this is a workaround for # https://github.com/pytorch/pytorch/pull/151218 @@ -288,3 +358,10 @@ def flashinfer_sample( ) return next_token_ids.view(-1) + + +def _to_tensor_scalar_tuple(x): + if isinstance(x, torch.Tensor): + return (x, 0) + else: + return (None, x) From c64c0b78de4716ef019666663c56b6ceaa019463 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 18 Nov 2025 09:44:18 -0800 Subject: [PATCH 160/578] [chore] Move the rest of wikimedia url to S3 (#28921) Signed-off-by: Kevin H. 
Luu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/features/multimodal_inputs.md | 2 +- docs/serving/openai_compatible_server.md | 2 +- examples/offline_inference/vision_language_pooling.py | 4 ++-- .../openai_chat_completion_client_for_multimodal.py | 2 +- .../openai_chat_embedding_client_for_multimodal.py | 2 +- tests/entrypoints/openai/test_vision.py | 8 ++++---- tests/entrypoints/pooling/openai/test_vision_embedding.py | 8 ++++---- .../language/pooling/test_mm_classifier_conversion.py | 2 +- tests/multimodal/test_utils.py | 8 ++++---- tests/utils.py | 2 +- .../v1/entrypoints/openai/serving_responses/test_image.py | 8 ++++---- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cde2ec165712..5f684604e603 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -483,7 +483,7 @@ Then, you can use the OpenAI client as follows: ) # Single-image input inference - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 821628e6e317..23df3963823a 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -293,7 +293,7 @@ and passing a list of `messages` in the request. Refer to the examples below for base_url="http://localhost:8000/v1", api_key="EMPTY", ) - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" response = create_chat_embeddings( client, diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py index 63d85d5d9eef..530aad4bc031 100644 --- a/examples/offline_inference/vision_language_pooling.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -266,7 +266,7 @@ def get_query(modality: QueryModality): return ImageQuery( modality="image", image=fetch_image( - "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501 ), ) @@ -275,7 +275,7 @@ def get_query(modality: QueryModality): modality="text+image", text="A cat standing in the snow.", image=fetch_image( - "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" # noqa: E501 ), ) diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 520cbca003aa..3d1259276998 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ 
b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -66,7 +66,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None: # Single-image input inference def run_single_image(model: str, max_completion_tokens: int) -> None: ## Use image url in the payload - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_completion_from_url = client.chat.completions.create( messages=[ { diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 261b810ce5d0..47c2c5030078 100644 --- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -21,7 +21,7 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" +image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" def create_chat_embeddings( diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 2a7df08ea3b0..d83c6726e72d 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -17,10 +17,10 @@ # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] EXPECTED_MM_BEAM_SEARCH_RES = [ diff --git a/tests/entrypoints/pooling/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py index 944392d66fa5..1befb5a3cf7a 100644 --- a/tests/entrypoints/pooling/openai/test_vision_embedding.py +++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py @@ -19,10 +19,10 @@ # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - 
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py index 2482452645ef..a31a771238e2 100644 --- a/tests/models/language/pooling/test_mm_classifier_conversion.py +++ b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -75,7 +75,7 @@ def test_gemma_multimodal( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg" + "url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/red_chair.jpg" }, }, {"type": "text", "text": "A fine 19th century piece of furniture."}, diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index ea795fcbbde5..639e290406fe 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -16,10 +16,10 @@ # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] TEST_VIDEO_URLS = [ diff --git a/tests/utils.py b/tests/utils.py index c8f18384c511..c31a2aeeb9c8 100644 --- 
a/tests/utils.py +++ b/tests/utils.py @@ -676,7 +676,7 @@ def compare_all_settings( results += _test_image_text( client, model, - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ) elif method == "encode": results += _test_embeddings(client, model, prompt) diff --git a/tests/v1/entrypoints/openai/serving_responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py index 980d83b787e7..be5693bbf273 100644 --- a/tests/v1/entrypoints/openai/serving_responses/test_image.py +++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py @@ -15,10 +15,10 @@ MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] From e4bb2684bcea12f72a36a6c48292f79534af849a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 19 Nov 2025 02:56:04 +0800 Subject: [PATCH 161/578] [Models] Replace all `nn.Conv2d` with vLLM's Conv2dLayer (#28842) Signed-off-by: Isotr0py --- vllm/model_executor/layers/conv.py | 24 +++++++++++++-- vllm/model_executor/models/aimv2.py | 3 +- vllm/model_executor/models/blip.py | 3 +- vllm/model_executor/models/chameleon.py | 29 +++++++++---------- vllm/model_executor/models/deepencoder.py | 13 +++++---- vllm/model_executor/models/dots_ocr.py | 3 +- vllm/model_executor/models/glm4_1v.py | 4 +-- vllm/model_executor/models/glm4v.py | 5 ++-- .../models/idefics2_vision_model.py | 3 +- vllm/model_executor/models/intern_vit.py | 3 +- vllm/model_executor/models/interns1_vit.py | 3 +- vllm/model_executor/models/keye.py | 3 +- vllm/model_executor/models/midashenglm.py | 3 +- vllm/model_executor/models/moonvit.py | 3 +- vllm/model_executor/models/paddleocr_vl.py | 3 +- vllm/model_executor/models/pixtral.py | 5 ++-- vllm/model_executor/models/qwen_vl.py | 3 +- vllm/model_executor/models/siglip.py | 3 +- vllm/model_executor/models/siglip2navit.py | 5 ++-- vllm/model_executor/models/step3_vl.py | 7 +++-- 20 files changed, 83 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py index e6f2d2990c24..8d51e5bd9920 100644 --- a/vllm/model_executor/layers/conv.py +++ b/vllm/model_executor/layers/conv.py @@ -3,6 +3,7 @@ """Conv Layer Class.""" import 
math +from typing import Literal import torch import torch.nn as nn @@ -23,11 +24,11 @@ def __init__( out_channels: int, kernel_size: int | tuple[int, ...], stride: int | tuple[int, ...] = 1, - padding: int | tuple[int, ...] = 0, + padding: int | tuple[int, ...] | Literal["same", "valid"] = 0, dilation: int | tuple[int, ...] = 1, groups: int = 1, bias: bool = True, - padding_mode: str = "zeros", + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", *, params_dtype: torch.dtype | None = None, ) -> None: @@ -36,6 +37,22 @@ def __init__( if params_dtype is None: params_dtype = torch.get_default_dtype() + valid_padding_strings = {"same", "valid"} + if isinstance(padding, str) and padding not in valid_padding_strings: + raise ValueError( + f"Invalid padding string '{padding}'. " + f"Expected one of {valid_padding_strings}." + ) + + if padding == "same": + padding = ( + kernel_size // 2 + if isinstance(kernel_size, int) + else tuple(k // 2 for k in kernel_size) + ) + elif padding == "valid": + padding = 0 + kernel_size = ( (kernel_size,) * self.num_dim if isinstance(kernel_size, int) @@ -45,6 +62,9 @@ def __init__( padding = (padding,) * self.num_dim if isinstance(padding, int) else padding dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation + if padding == "same" and any(s != 1 for s in stride): + raise ValueError("padding='same' is not supported for strided convolutions") + self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index 5872e8196ead..3d000f3ac3ab 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -12,6 +12,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.utils import divide from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -58,7 +59,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class AIMv2PatchEmbed(nn.Module): def __init__(self, config: AIMv2Config): super().__init__() - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( config.num_channels, config.hidden_size, kernel_size=(config.patch_size, config.patch_size), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 2e4f73312efa..f31f99c0592b 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -12,6 +12,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -47,7 +48,7 @@ def __init__(self, config: BlipVisionConfig | Blip2VisionConfig): self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index fb7476c45fcd..3c87bbfefab3 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -22,6 +22,7 @@ from 
vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -549,7 +550,7 @@ def forward(self, hidden_state: torch.Tensor): class ChameleonVQVAEEncoderConvDownsample(nn.Module): def __init__(self, in_channels: int): super().__init__() - self.conv = nn.Conv2d( + self.conv = Conv2dLayer( in_channels, in_channels, kernel_size=3, stride=2, padding=0 ) @@ -577,23 +578,23 @@ def __init__( self.norm1 = torch.nn.GroupNorm( num_groups=32, num_channels=in_channels, eps=1e-6, affine=True ) - self.conv1 = torch.nn.Conv2d( + self.conv1 = Conv2dLayer( in_channels, out_channels, kernel_size=3, stride=1, padding=1 ) self.norm2 = torch.nn.GroupNorm( num_groups=32, num_channels=out_channels, eps=1e-6, affine=True ) self.dropout = torch.nn.Dropout(config.dropout) - self.conv2 = torch.nn.Conv2d( + self.conv2 = Conv2dLayer( out_channels, out_channels, kernel_size=3, stride=1, padding=1 ) if self.in_channels != self.out_channels: if self.use_conv_shortcut: - self.conv_shortcut = torch.nn.Conv2d( + self.conv_shortcut = Conv2dLayer( in_channels, out_channels, kernel_size=3, stride=1, padding=1 ) else: - self.nin_shortcut = torch.nn.Conv2d( + self.nin_shortcut = Conv2dLayer( in_channels, out_channels, kernel_size=1, stride=1, padding=0 ) @@ -626,16 +627,16 @@ def __init__(self, in_channels: int): self.norm = torch.nn.GroupNorm( num_groups=32, num_channels=in_channels, eps=1e-6, affine=True ) - self.q = torch.nn.Conv2d( + self.q = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) - self.k = torch.nn.Conv2d( + self.k = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) - self.v = torch.nn.Conv2d( + self.v = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) - self.proj_out = torch.nn.Conv2d( + self.proj_out = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) @@ -681,7 +682,7 @@ def __init__(self, config: ChameleonVQVAEConfig): latent_channels = config.latent_channels channel_multiplier = config.channel_multiplier - self.conv_in = torch.nn.Conv2d( + self.conv_in = Conv2dLayer( in_channels, base_channels, kernel_size=3, stride=1, padding=1 ) @@ -738,7 +739,7 @@ def __init__(self, config: ChameleonVQVAEConfig): self.norm_out = torch.nn.GroupNorm( num_groups=32, num_channels=block_in, eps=1e-6, affine=True ) - self.conv_out = torch.nn.Conv2d( + self.conv_out = Conv2dLayer( block_in, 2 * latent_channels if double_latent else latent_channels, kernel_size=3, @@ -779,10 +780,8 @@ def __init__(self, config: ChameleonVQVAEConfig): super().__init__() self.encoder = ChameleonVQVAEEncoder(config) self.quantize = ChameleonVQVAEVectorQuantizer(config) - self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1) - self.post_quant_conv = torch.nn.Conv2d( - config.embed_dim, config.latent_channels, 1 - ) + self.quant_conv = Conv2dLayer(config.latent_channels, config.embed_dim, 1) + self.post_quant_conv = Conv2dLayer(config.embed_dim, config.latent_channels, 1) self.eval() # Chameleon's VQ model is frozen def encode( diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py index e62a57eccc95..8f1660891fcb 100644 --- a/vllm/model_executor/models/deepencoder.py +++ 
b/vllm/model_executor/models/deepencoder.py @@ -19,6 +19,7 @@ from transformers import CLIPVisionConfig from vllm.attention.layer import MultiHeadAttention +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -133,14 +134,14 @@ def __init__( self.blocks.append(block) self.neck = nn.Sequential( - nn.Conv2d( + Conv2dLayer( embed_dim, out_chans, kernel_size=1, bias=False, ), LayerNorm2d(out_chans), - nn.Conv2d( + Conv2dLayer( out_chans, out_chans, kernel_size=3, @@ -150,8 +151,10 @@ def __init__( LayerNorm2d(out_chans), ) - self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False) - self.net_3 = nn.Conv2d( + self.net_2 = Conv2dLayer( + 256, 512, kernel_size=3, stride=2, padding=1, bias=False + ) + self.net_3 = Conv2dLayer( 512, 1024, kernel_size=3, stride=2, padding=1, bias=False ) @@ -500,7 +503,7 @@ def __init__( """ super().__init__() - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding ) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index f46caaa095c6..2d2251e83b5b 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -22,6 +22,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -471,7 +472,7 @@ def __init__(self, config): self.temporal_patch_size = config.temporal_patch_size self.embed_dim = config.embed_dim self.config = config - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( config.num_channels, config.embed_dim, kernel_size=(config.patch_size, config.patch_size), diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 65c3fc2d9e97..2c2f45c2453e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -56,7 +56,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor.layers.conv import Conv3dLayer +from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -734,7 +734,7 @@ def __init__( self.post_conv_layernorm = RMSNorm( vision_config.hidden_size, eps=vision_config.rms_norm_eps ) - self.downsample = nn.Conv2d( + self.downsample = Conv2dLayer( in_channels=vision_config.hidden_size, out_channels=vision_config.out_hidden_size, kernel_size=vision_config.spatial_merge_size, diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 1c18ea0745f2..514082cf60ce 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, @@ -78,7 
+79,7 @@ class GLMVImagePixelInputs(TensorSchema): class EVA2CLIPPatchEmbedding(nn.Module): def __init__(self, config): super().__init__() - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( config.in_channels, config.hidden_size, kernel_size=config.patch_size, @@ -333,7 +334,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.linear_proj", ) - self.conv = nn.Conv2d( + self.conv = Conv2dLayer( in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2, diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 727c8ec0397c..06b8468e18db 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -30,6 +30,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -60,7 +61,7 @@ def __init__(self, config: Idefics2VisionConfig): self.embed_dim = config.hidden_size self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 03918127c6ae..61aeafc2ab43 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -24,6 +24,7 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -51,7 +52,7 @@ def __init__(self, config: PretrainedConfig): self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py index 507503d75046..cb0414bbc95a 100644 --- a/vllm/model_executor/models/interns1_vit.py +++ b/vllm/model_executor/models/interns1_vit.py @@ -16,6 +16,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.quantization import QuantizationConfig @@ -43,7 +44,7 @@ def __init__(self, config): self.num_patches = num_patches self.patch_shape = patch_shape - self.projection = nn.Conv2d( + self.projection = Conv2dLayer( num_channels, hidden_size, kernel_size=patch_size, stride=patch_size ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 1eb0eccc0411..8fc3db296aa7 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from 
vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -204,7 +205,7 @@ def __init__(self, config: PretrainedConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index a84c99059cd9..d9b23811730d 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -39,6 +39,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -120,7 +121,7 @@ def __init__( self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = flatten - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( in_chans, embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 8017c947bf9a..2e3e6dc166ad 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -53,6 +53,7 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.models.utils import maybe_prefix from vllm.transformers_utils.configs.moonvit import MoonViTConfig @@ -244,7 +245,7 @@ def __init__( ) self.patch_size = patch_size - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( in_dim, out_dim, kernel_size=patch_size, stride=patch_size ) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 3ef6470070d1..dee0c16ab0f6 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -45,6 +45,7 @@ from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -419,7 +420,7 @@ def __init__(self, config: PretrainedConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 8cb7d6a889da..8a034fd72b02 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,6 +31,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -747,7 +748,7 @@ class VisionTransformer(nn.Module): def 
__init__(self, args: VisionEncoderArgs): super().__init__() self.args = args - self.patch_conv = nn.Conv2d( + self.patch_conv = Conv2dLayer( in_channels=args.num_channels, out_channels=args.hidden_size, kernel_size=args.patch_size, @@ -1212,7 +1213,7 @@ def __init__( self.config = config - self.patch_conv = nn.Conv2d( + self.patch_conv = Conv2dLayer( in_channels=config.num_channels, out_channels=config.hidden_size, kernel_size=config.patch_size, diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 6a259cade9cf..4906cf441f6f 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -25,6 +25,7 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, ReplicatedLinear, @@ -333,7 +334,7 @@ def __init__( patch_height, patch_width = self.patch_size = (patch_size, patch_size) self.grid_size = (image_height // patch_height, image_width // patch_width) self.output_dim = output_dim - self.conv1 = nn.Conv2d( + self.conv1 = Conv2dLayer( in_channels=3, out_channels=width, kernel_size=patch_size, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 42d906d089f9..ce5847bf79a5 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -286,7 +287,7 @@ def __init__(self, config: SiglipVisionConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 29dd164ad37f..46f5e67d659e 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -16,6 +16,7 @@ from vllm.attention.layer import maybe_get_vit_flash_attn_backend from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, LinearBase, @@ -67,7 +68,7 @@ def __init__(self, config: PretrainedConfig): self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim) else: - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, @@ -99,7 +100,7 @@ def forward( target_dtype = self.patch_embedding.weight.dtype if isinstance(self.patch_embedding, LinearBase): patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) - elif isinstance(self.patch_embedding, nn.Conv2d): + elif isinstance(self.patch_embedding, Conv2dLayer): pixel_values = pixel_values.view( -1, self.config.num_channels * self.config.temporal_patch_size, diff --git a/vllm/model_executor/models/step3_vl.py 
b/vllm/model_executor/models/step3_vl.py index 5d16be1eb312..1c60cb414812 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -20,6 +20,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -667,7 +668,7 @@ def __init__(self, config: Step3VisionEncoderConfig): self.class_embedding = nn.Parameter(torch.randn(1, self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, @@ -950,13 +951,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: prefix=maybe_prefix(prefix, "vision_model"), use_data_parallel=self.use_data_parallel, ) - self.vit_downsampler = nn.Conv2d( + self.vit_downsampler = Conv2dLayer( config.vision_config.hidden_size, config.vision_config.output_hidden_size, kernel_size=2, stride=config.understand_projector_stride, ) - self.vit_downsampler2 = nn.Conv2d( + self.vit_downsampler2 = Conv2dLayer( config.vision_config.output_hidden_size, config.vision_config.output_hidden_size * 2, kernel_size=3, From c3e29786209d91d3842e839b62f4d1d815902262 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 18 Nov 2025 13:03:23 -0600 Subject: [PATCH 162/578] [NIXL] fix cpu PD after physical <> logical block_size PR (#28904) Signed-off-by: Chendi Xue --- .../nixl_integration/run_accuracy_test.sh | 9 +++++++-- tools/install_nixl_from_source_ubuntu.py | 1 + .../kv_transfer/kv_connector/v1/nixl_connector.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 87c9a105e936..453ccc81eb14 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -55,7 +55,7 @@ DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) -SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || echo "") # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -91,8 +91,13 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" - else + elif [[ "$SMI_BIN" == *"rocm"* ]]; then echo "$($SMI_BIN -l | grep GPU | wc -l)" + else + # works for non-cuda platforms, + # assuming at least 1 device and + # let system to decide which card to use + echo "1" fi } diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index a786abba95ad..b8a55c615426 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -95,6 +95,7 @@ def install_system_dependencies(): "meson", "libtool", "libtool-bin", + "pkg-config", ] run_command(["apt-get", "update"]) run_command(["apt-get", "install", "-y"] + apt_packages) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 5ff95876ef34..1626f819af8b 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1161,6 +1161,14 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): # to better exploit the memory layout (ie num_blocks is the first dim). split_k_and_v = self.kv_topo.split_k_and_v tensor_size_bytes = None + + # TODO (NickLucche): Get kernel_block_size in a cleaner way + # NHD default "view" for non-MLA cache + if self.device_type == "cpu": + block_size_position = -2 + else: + block_size_position = -2 if self.use_mla else -3 + # Enable different block lengths for different layers when MLA is used. self.block_len_per_layer = list[int]() self.slot_size_per_layer = list[int]() # HD bytes in kv terms @@ -1175,9 +1183,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): if base_addr in seen_base_addresses: continue - # TODO (NickLucche): Get kernel_block_size in a cleaner way - # NHD default "view" for non-MLA cache - kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3] + kernel_block_size = cache.shape[block_size_position] if self.block_size != kernel_block_size: logger.info_once( From 2a2d5d2780bf25035438263605c7784f12afb718 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 19 Nov 2025 03:34:36 +0800 Subject: [PATCH 163/578] Replace `torch.cuda.Event` with `torch.Event` for better hardware compatibility (#26985) Signed-off-by: Kunshang Ji --- benchmarks/kernels/benchmark_cutlass_moe_fp8.py | 4 ++-- benchmarks/kernels/benchmark_moe.py | 4 ++-- .../kernels/benchmark_moe_permute_unpermute.py | 8 ++++---- .../kernels/benchmark_per_token_group_quant.py | 4 ++-- benchmarks/kernels/benchmark_silu_mul_fp8_quant.py | 4 ++-- .../kernels/benchmark_trtllm_decode_attention.py | 4 ++-- .../kernels/benchmark_trtllm_prefill_attention.py | 4 ++-- benchmarks/kernels/benchmark_w8a8_block_fp8.py | 4 ++-- tests/kernels/attention/test_merge_attn_states.py | 8 ++++---- vllm/v1/kv_offload/worker/cpu_gpu.py | 6 +++--- vllm/v1/worker/cpu_model_runner.py | 6 +++--- vllm/v1/worker/gpu_input_batch.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 12 ++++++------ vllm/v1/worker/ubatching.py | 8 ++++---- vllm/v1/worker/xpu_model_runner.py | 9 +-------- 15 files changed, 41 insertions(+), 48 deletions(-) diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 027f67ad4db6..e07d6c776bc0 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -255,8 +255,8 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100): torch.cuda.synchronize() # Timing - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies = [] for _ in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c99951aa2782..a1af0b8aec3d 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -185,8 +185,8 @@ def run(): graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py 
b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index efa5a7386027..b8913a217c60 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -105,8 +105,8 @@ def run(): graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): @@ -241,8 +241,8 @@ def run(input: tuple): graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index bdc1eb733084..eba4d510258b 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -30,8 +30,8 @@ def _time_cuda( fn() torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) start.record() for _ in range(bench_iters): diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index a5887aafd30d..de01ff197eab 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -253,8 +253,8 @@ def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"): ) torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) # Benchmark latencies: list[float] = [] diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 29ce18234dfa..1d0d6fbb9a47 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -127,8 +127,8 @@ def benchmark_decode( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 2a25d0374811..84bde723abf7 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -139,8 +139,8 @@ def benchmark_prefill( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index ab54f81985bc..b52500c8c521 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ 
-183,8 +183,8 @@ def run(): run() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index 9b084f2f660b..c7662223e1ca 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -150,8 +150,8 @@ def test_merge_attn_states( output_torch = output.clone() output_lse_torch = output_lse.clone() total_time_torch_kernel = 0 - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) # 0. Run the Torch kernel prefix_lse_torch = prefix_lse.clone() @@ -188,8 +188,8 @@ def test_merge_attn_states( output_lse_ref_triton = output_lse.clone() total_time_triton_kernel = 0 - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) for _ in range(warmup_times): merge_attn_states_triton( diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 646f9d0d7542..0f2ec4a1b41f 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -68,9 +68,9 @@ def __init__( self.h2d_stream = torch.cuda.Stream() # job_id -> transfer cuda event - self.transfer_events: dict[int, torch.cuda.Event] = {} + self.transfer_events: dict[int, torch.Event] = {} # list of cuda events available for re-use - self.events_pool: list[torch.cuda.Event] = [] + self.events_pool: list[torch.Event] = [] pin_memory = is_pin_memory_available() @@ -153,7 +153,7 @@ def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: ) src_to_dst_tensor = torch.from_numpy(src_to_dst) - event = self.events_pool.pop() if self.events_pool else torch.cuda.Event() + event = self.events_pool.pop() if self.events_pool else torch.Event() with torch.cuda.stream(stream): for src_tensor, dst_tensor, kv_dim in zip( src_tensors, dst_tensors, self.kv_dim_before_num_blocks diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 40f011fed1ad..6bfbc32d598f 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -96,14 +96,14 @@ class _StreamPlaceholder: def __init__(self, *args, **kwargs) -> None: pass - cuda_event = torch.cuda.Event + cuda_event = torch.Event cuda_stream = torch.cuda.Stream try: - torch.cuda.Event = _EventPlaceholder + torch.Event = _EventPlaceholder torch.cuda.Stream = _StreamPlaceholder yield finally: - torch.cuda.Event = cuda_event + torch.Event = cuda_event torch.cuda.Stream = cuda_stream diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 7cf6afa3fc37..023b5edb2c34 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -265,7 +265,7 @@ def __init__( # ids from prior step, if required by current sampling params # (e.g. penalties). 
self.sampled_token_ids_cpu: torch.Tensor | None = None - self.async_copy_ready_event: torch.cuda.Event | None = None + self.async_copy_ready_event: torch.Event | None = None @property def req_ids(self) -> list[str]: @@ -891,7 +891,7 @@ def make_lora_inputs( def set_async_sampled_token_ids( self, sampled_token_ids_cpu: torch.Tensor, - async_copy_ready_event: torch.cuda.Event, + async_copy_ready_event: torch.Event, ) -> None: """ In async scheduling case, store ref to sampled_token_ids_cpu diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 67f575f92cc6..506118d2d762 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -185,7 +185,7 @@ def __init__( self._invalid_req_indices = invalid_req_indices # Event on the copy stream so we can synchronize the non-blocking copy. - self.async_copy_ready_event = torch.cuda.Event() + self.async_copy_ready_event = torch.Event() # Keep a reference to the device tensor to avoid it being # deallocated until we finish copying it to the host. @@ -435,10 +435,10 @@ def __init__( self.async_output_copy_stream: torch.cuda.Stream | None = None # cuda event to synchronize use of reused CPU tensors between steps # when async scheduling is enabled. - self.prepare_inputs_event: torch.cuda.Event | None = None + self.prepare_inputs_event: torch.Event | None = None if self.use_async_scheduling: self.async_output_copy_stream = torch.cuda.Stream() - self.prepare_inputs_event = torch.cuda.Event() + self.prepare_inputs_event = torch.Event() # self.cudagraph_batch_sizes sorts in ascending order. if ( @@ -549,7 +549,7 @@ def __init__( # Cached outputs. self._draft_token_ids: list[list[int]] | torch.Tensor | None = None - self.transfer_event = torch.cuda.Event() + self.transfer_event = torch.Event() self.sampled_token_ids_pinned_cpu = torch.empty( (self.max_num_reqs, 1), dtype=torch.int64, @@ -559,10 +559,10 @@ def __init__( # Pre-allocated tensor for copying valid sampled token counts to CPU, # with dedicated stream for overlapping and event for coordination. - self.valid_sampled_token_count_event: torch.cuda.Event | None = None + self.valid_sampled_token_count_event: torch.Event | None = None self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None if self.use_async_scheduling and self.num_spec_tokens: - self.valid_sampled_token_count_event = torch.cuda.Event() + self.valid_sampled_token_count_event = torch.Event() self.valid_sampled_token_count_copy_stream = torch.cuda.Stream() self.valid_sampled_token_count_cpu = torch.empty( self.max_num_reqs, diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index 9f16b1e6d03e..be8326e2fdbc 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -27,8 +27,8 @@ def __init__( ready_barrier: threading.Barrier, cpu_wait_event: threading.Event, cpu_signal_event: threading.Event, - gpu_comm_done_event: torch.cuda.Event, - gpu_compute_done_event: torch.cuda.Event, + gpu_comm_done_event: torch.Event, + gpu_compute_done_event: torch.Event, schedule: str = "default", ): self.id = id @@ -207,8 +207,8 @@ def make_ubatch_contexts( Create a context manager for micro-batching synchronization. 
""" cpu_events = [threading.Event() for _ in range(num_micro_batches)] - gpu_comm_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)] - gpu_compute_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)] + gpu_comm_done_events = [torch.Event() for _ in range(num_micro_batches)] + gpu_compute_done_events = [torch.Event() for _ in range(num_micro_batches)] assert len(forward_contexts) == 2 diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index 4f82c18da73a..30563305853a 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -37,19 +37,12 @@ def _sync_device(self) -> None: @contextmanager def _torch_cuda_wrapper(): - class _EventPlaceholder: - def __init__(self, *args, **kwargs) -> None: - self.record = lambda: None - self.synchronize = lambda: None - try: # replace cuda APIs with xpu APIs, this should work by default - torch.cuda.Event = torch.xpu.Event torch.cuda.Stream = torch.xpu.Stream torch.cuda.default_stream = torch.xpu.current_stream torch.cuda.current_stream = torch.xpu.current_stream torch.cuda.stream = torch.xpu.stream yield finally: - # if anything goes wrong, just patch it with a placeholder - torch.cuda.Event = _EventPlaceholder + pass From 67745d189fd981ee824bde35666a3737a962c031 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 18 Nov 2025 15:29:06 -0500 Subject: [PATCH 164/578] Supress verbose logs from model_hosting_container_standards (#28949) Signed-off-by: mgoin --- vllm/entrypoints/openai/api_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3cf66fcd27e2..3974f45a7135 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -5,6 +5,7 @@ import importlib import inspect import json +import logging import multiprocessing import multiprocessing.forkserver as forkserver import os @@ -2020,6 +2021,9 @@ async def run_server(args, **uvicorn_kwargs) -> None: # Add process-specific prefix to stdout and stderr. decorate_logs("APIServer") + # Suppress verbose logs from model_hosting_container_standards + logging.getLogger("model_hosting_container_standards").setLevel(logging.ERROR) + listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) From 49ef847aa82c93615d5d86fac81e4716e9cd27cd Mon Sep 17 00:00:00 2001 From: Johnny Date: Wed, 19 Nov 2025 01:44:27 +0100 Subject: [PATCH 165/578] [NVIDIA] Guard SM100 CUTLASS MoE macro to SM100 builds v2 (#28938) Signed-off-by: johnnynunez Signed-off-by: Johnny --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a37040edbf1..c1c7478b9f3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,9 +512,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. 
CUTLASS 3.x) # require CUDA 12.8 or later if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS @@ -619,9 +619,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # FP4 Archs and flags if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS @@ -695,7 +695,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") @@ -741,9 +741,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") From 9912b8ccb861593d76216afa583ac593faf5a309 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 18 Nov 2025 19:45:20 -0500 Subject: [PATCH 166/578] [Build] Add OpenAI triton_kernels (#28788) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .gitignore | 3 ++ CMakeLists.txt | 5 ++ cmake/external_projects/triton_kernels.cmake | 53 +++++++++++++++++++ setup.py | 17 ++++++ .../layers/quantization/utils/mxfp4_utils.py | 2 + vllm/utils/import_utils.py | 40 +++++++++++++- 6 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 cmake/external_projects/triton_kernels.cmake diff --git a/.gitignore b/.gitignore index 50070d7898fe..7cda86478664 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# OpenAI triton kernels copied from source +vllm/third_party/triton_kernels/* + # triton jit .triton diff --git a/CMakeLists.txt b/CMakeLists.txt index c1c7478b9f3e..ae8e6175443f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1030,6 +1030,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP") WITH_SOABI) endif() +# For CUDA and HIP builds also build the triton_kernels external package. 
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") + include(cmake/external_projects/triton_kernels.cmake) +endif() + # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake new file mode 100644 index 000000000000..d35ad123dd9d --- /dev/null +++ b/cmake/external_projects/triton_kernels.cmake @@ -0,0 +1,53 @@ +# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels + +set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0") + +# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to +# be directly set to the triton_kernels python directory. +if (DEFINED ENV{TRITON_KERNELS_SRC_DIR}) + message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}") + FetchContent_Declare( + triton_kernels + SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR} + ) + +else() + set(TRITON_GIT "https://github.com/triton-lang/triton.git") + message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}") + FetchContent_Declare( + triton_kernels + # TODO (varun) : Fetch just the triton_kernels directory from Triton + GIT_REPOSITORY https://github.com/triton-lang/triton.git + GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG} + GIT_PROGRESS TRUE + SOURCE_SUBDIR python/triton_kernels/triton_kernels + ) +endif() + +# Fetch content +FetchContent_MakeAvailable(triton_kernels) + +if (NOT triton_kernels_SOURCE_DIR) + message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR") +endif() + +if (DEFINED ENV{TRITON_KERNELS_SRC_DIR}) + set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/") +else() + set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/") +endif() + +message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}") + +add_custom_target(triton_kernels) + +# Ensure the vllm/third_party directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")") + +## Copy .py files to install directory. +install(DIRECTORY + ${TRITON_KERNELS_PYTHON_DIR} + DESTINATION + vllm/third_party/triton_kernels/ + COMPONENT triton_kernels + FILES_MATCHING PATTERN "*.py") diff --git a/setup.py b/setup.py index e9b36e2a2e03..5591bcb13244 100644 --- a/setup.py +++ b/setup.py @@ -299,6 +299,20 @@ def run(self): os.makedirs(os.path.dirname(dst_file), exist_ok=True) self.copy_file(file, dst_file) + if _is_cuda() or _is_hip(): + # copy vllm/third_party/triton_kernels/**/*.py from self.build_lib + # to current directory so that they can be included in the editable + # build + print( + f"Copying {self.build_lib}/vllm/third_party/triton_kernels " + "to vllm/third_party/triton_kernels" + ) + shutil.copytree( + f"{self.build_lib}/vllm/third_party/triton_kernels", + "vllm/third_party/triton_kernels", + dirs_exist_ok=True, + ) + class precompiled_build_ext(build_ext): """Disables extension building when using precompiled binaries.""" @@ -633,6 +647,9 @@ def _read_requirements(filename: str) -> list[str]: if _is_cuda() or _is_hip(): ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) + # Optional since this doesn't get built (produce an .so file). 
This is just + # copying the relevant .py files from the source repository. + ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True)) if _is_hip(): ext_modules.append(CMakeExtension(name="vllm._rocm_C")) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 34a31bcf6a74..cbc46810a26a 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.triton_utils import triton +from vllm.utils.import_utils import has_triton_kernels from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer logger = init_logger(__name__) @@ -15,6 +16,7 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel""" + assert has_triton_kernels() import triton_kernels.matmul_ogs_details.opt_flags as opt_flags from triton_kernels.numerics import InFlexData from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index f01d2c7a6a33..ff0f0350fd94 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -18,6 +18,10 @@ import regex as re from typing_extensions import Never +from vllm.logger import init_logger + +logger = init_logger(__name__) + # TODO: This function can be removed if transformer_modules classes are # serialized by value when communicating between processes @@ -62,6 +66,35 @@ def import_pynvml(): return pynvml +@cache +def import_triton_kernels(): + """ + For convenience, prioritize triton_kernels that is available in + `site-packages`. Use `vllm.third_party.triton_kernels` as a fall-back. + """ + if _has_module("triton_kernels"): + import triton_kernels + + logger.debug_once( + f"Loading module triton_kernels from {triton_kernels.__file__}.", + scope="local", + ) + elif _has_module("vllm.third_party.triton_kernels"): + import vllm.third_party.triton_kernels as triton_kernels + + logger.debug_once( + f"Loading module triton_kernels from {triton_kernels.__file__}.", + scope="local", + ) + sys.modules["triton_kernels"] = triton_kernels + else: + logger.info_once( + "triton_kernels unavailable in this build. " + "Please consider installing triton_kernels from " + "https://github.com/triton-lang/triton/tree/main/python/triton_kernels" + ) + + def import_from_path(module_name: str, file_path: str | os.PathLike): """ Import a Python file according to its file path. 
@@ -397,7 +430,12 @@ def has_deep_gemm() -> bool: def has_triton_kernels() -> bool: """Whether the optional `triton_kernels` package is available.""" - return _has_module("triton_kernels") + is_available = _has_module("triton_kernels") or _has_module( + "vllm.third_party.triton_kernels" + ) + if is_available: + import_triton_kernels() + return is_available def has_tilelang() -> bool: From 1395461f5fb76145433c1dc8a3b7262ee3799bf8 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Wed, 19 Nov 2025 02:49:36 +0200 Subject: [PATCH 167/578] [Hybrid][torch.compile] Refactor mamba2 forward to avoid obscuring linear projections under custom op (#28587) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- .../layers/mamba/mamba_mixer2.py | 156 ++++++++++-------- vllm/model_executor/models/bamba.py | 3 +- vllm/model_executor/models/falcon_h1.py | 4 +- .../model_executor/models/granitemoehybrid.py | 3 +- vllm/model_executor/models/mamba2.py | 3 +- vllm/model_executor/models/nemotron_h.py | 3 +- vllm/model_executor/models/zamba2.py | 6 +- 7 files changed, 90 insertions(+), 88 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index fb45afa33dad..57313990b820 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -426,6 +426,10 @@ def __init__( # `ColumnParallelLinear` and `MergedColumnParallelLinear`, # and `set_weight_attrs` doesn't allow to override it self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + conv_weights = self.conv1d.weight.view( + self.conv1d.weight.size(0), self.conv1d.weight.size(2) + ) + self.register_buffer("conv_weights", conv_weights, persistent=False) # - these are TPed by heads to reduce the size of the # temporal shape @@ -459,6 +463,17 @@ def __init__( intermediate_size, n_groups, self.use_rms_norm, eps=rms_norm_eps ) + # - get hidden_states, B and C after depthwise convolution. + self.split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( + hidden_states_B_C, + [ + self.intermediate_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + ], + dim=-1, + ) + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -470,10 +485,24 @@ def __init__( self.cache_config = cache_config self.prefix = prefix + # Pre-compute sizes for forward pass + self.tped_intermediate_size = self.intermediate_size // self.tp_size + self.tped_conv_size = self.conv_dim // self.tp_size + self.tped_dt_size = self.num_heads // self.tp_size + + self.split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( + hidden_states_B_C, + [ + self.tped_intermediate_size, + self.groups_ssm_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + ], + dim=-1, + ) + def forward_native( self, hidden_states: torch.Tensor, - output: torch.Tensor, mup_vector: torch.Tensor | None = None, ): pass @@ -481,22 +510,55 @@ def forward_native( def forward( self, hidden_states: torch.Tensor, - output: torch.Tensor, mup_vector: torch.Tensor | None = None, ): + # 1. Gated MLP's linear projection + projected_states, _ = self.in_proj(hidden_states) + if mup_vector is not None: + projected_states = projected_states * mup_vector + + # 2. 
Prepare inputs for conv + SSM + ssm_output = torch.empty( + [ + hidden_states.shape[0], + (self.num_heads // self.tp_size) * self.head_dim, + ], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + # 3. conv + SSM + # (split `projected_states` into hidden_states_B_C, dt in the custom op to + # ensure it is not treated as an intermediate tensor by torch compile) torch.ops.vllm.mamba_mixer2( - hidden_states, - output, + projected_states, + ssm_output, self.prefix, - mup_vector, ) - def forward_cuda( + # 4. gated MLP + # GatedRMSNorm internally applying SiLU to the gate + # SiLU is applied internally before normalization, unlike standard + # norm usage + gate = projected_states[..., : self.tped_intermediate_size] + hidden_states = self.norm(ssm_output, gate) + + # 5. Final linear projection + output, _ = self.out_proj(hidden_states) + + return output + + def conv_ssm_forward( self, - hidden_states: torch.Tensor, + projected_states: torch.Tensor, output: torch.Tensor, - mup_vector: torch.Tensor | None = None, ): + hidden_states_B_C, dt = torch.split( + projected_states[..., self.tped_intermediate_size :], + [self.tped_conv_size, self.tped_dt_size], + dim=-1, + ) + forward_context = get_forward_context() # attn_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill @@ -524,46 +586,13 @@ def forward_cuda( cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p last_chunk_indices_p = attn_metadata.last_chunk_indices_p - # 1. Gated MLP's linear projection - projected_states, _ = self.in_proj(hidden_states) - - if mup_vector is not None: - projected_states = projected_states * mup_vector - - gate, hidden_states_B_C, dt = torch.split( - projected_states, - [ - self.intermediate_size // self.tp_size, - self.conv_dim // self.tp_size, - self.num_heads // self.tp_size, - ], - dim=-1, - ) - - conv_weights = self.conv1d.weight.view( - self.conv1d.weight.size(0), self.conv1d.weight.size(2) - ) - - # - get hidden_states, B and C after depthwise convolution. 
- split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( - hidden_states_B_C, - [ - self.intermediate_size // self.tp_size, - self.groups_ssm_state_size // self.tp_size, - self.groups_ssm_state_size // self.tp_size, - ], - dim=-1, - ) - if attn_metadata is None: # profile run hidden_states_B_C = ( hidden_states_B_C.transpose(0, 1).clone().transpose(0, 1) ).contiguous() - hidden_states, _B, _C = split_hidden_states_B_C_fn(hidden_states_B_C) - hidden_states = self.norm(hidden_states, gate) - out, _ = self.out_proj(hidden_states) - return out + hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C) + return hidden_states # NOTE: V0 put prefill before decode, v1 puts decode before prefill num_prefills = attn_metadata.num_prefills # request count @@ -622,18 +651,8 @@ def forward_cuda( block_idx_first_scheduled_token_p = None num_computed_tokens_p = None - # Preallocate output tensor to avoid memcpy cost for merging prefill - # and decode outputs - preallocated_ssm_out = torch.empty( - [ - num_prefill_tokens + num_decodes, - (self.num_heads // self.tp_size) * self.head_dim, - ], - dtype=hidden_states.dtype, - device=hidden_states.device, - ) preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( - preallocated_ssm_out, + output[:num_actual_tokens], [num_decodes, num_prefill_tokens], dim=0, ) @@ -658,7 +677,7 @@ def forward_cuda( ) # this is the form that causal-conv see hidden_states_B_C_p = causal_conv1d_fn( x, - conv_weights, + self.conv_weights, self.conv1d.bias, activation=self.activation, conv_states=conv_state, @@ -673,7 +692,9 @@ def forward_cuda( query_start_loc=query_start_loc_p, ).transpose(0, 1)[:num_prefill_tokens] - hidden_states_p, B_p, C_p = split_hidden_states_B_C_fn(hidden_states_B_C_p) + hidden_states_p, B_p, C_p = self.split_hidden_states_B_C_fn( + hidden_states_B_C_p + ) # 3. State Space Model sequence transformation initial_states = None @@ -815,7 +836,7 @@ def forward_cuda( hidden_states_B_C_d = causal_conv1d_update( hidden_states_B_C_d, conv_state, - conv_weights, + self.conv_weights, self.conv1d.bias, self.activation, conv_state_indices=state_indices_tensor_d, @@ -823,7 +844,9 @@ def forward_cuda( initial_state_idx=block_idx_last_computed_token_d, ) - hidden_states_d, B_d, C_d = split_hidden_states_B_C_fn(hidden_states_B_C_d) + hidden_states_d, B_d, C_d = self.split_hidden_states_B_C_fn( + hidden_states_B_C_d + ) # 3. State Space Model sequence transformation n_groups = self.n_groups // self.tp_size @@ -861,15 +884,6 @@ def forward_cuda( out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim), ) - # 4. gated MLP - # GatedRMSNorm internally applying SiLU to the gate - # SiLU is applied internally before normalization, unlike standard - # norm usage - hidden_states = self.norm(preallocated_ssm_out, gate[:num_actual_tokens]) - - # 5. 
Final linear projection - output[:num_actual_tokens], _ = self.out_proj(hidden_states) - def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: assert self.model_config is not None assert self.cache_config is not None @@ -901,21 +915,19 @@ def get_attn_backend(self) -> type["AttentionBackend"]: def mamba_mixer2( - hidden_states: torch.Tensor, + projected_states: torch.Tensor, output: torch.Tensor, layer_name: str, - mup_vector: torch.Tensor | None = None, ) -> None: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self.forward_cuda(hidden_states=hidden_states, output=output, mup_vector=mup_vector) + self.conv_ssm_forward(projected_states=projected_states, output=output) def mamba_mixer2_fake( - hidden_states: torch.Tensor, + projected_states: torch.Tensor, output: torch.Tensor, layer_name: str, - mup_vector: torch.Tensor | None = None, ) -> None: return diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index e0a2defd5127..c6cc83487fec 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -138,8 +138,7 @@ def forward( else: hidden_states, residual = self.input_layernorm(hidden_states, residual) - output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output) + output = self.mamba(hidden_states) # Fully Connected hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 3653425b8e1c..b985847af5da 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -198,10 +198,8 @@ def forward( residual: torch.Tensor | None, **kwargs, ): - output = torch.empty_like(hidden_states) - self.mamba( + output = self.mamba( hidden_states, - output, mup_vector=self.mup_vector, ) return output, residual diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 05177f1d1ac2..a340112ec62a 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -115,8 +115,7 @@ def forward( ): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output) + output = self.mamba(hidden_states) hidden_states = residual + output * self.residual_multiplier residual = hidden_states diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index fc17f98be198..5fcfa9431230 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -87,8 +87,7 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output) + output = self.mixer(hidden_states) return output, residual diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index f7e0caf410e1..8675eff59222 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -376,8 +376,7 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output) + output = self.mixer(hidden_states) return output, residual diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py 
index 64e6979c8fcf..729a9655d087 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -567,11 +567,7 @@ def forward( hidden_states = self.input_layernorm(hidden_states) # Process through Mamba mixer - output = torch.empty_like(hidden_states) - self.mamba( - hidden_states, - output, - ) + output = self.mamba(hidden_states) # residual connection after mamba hidden_states = residual + output From da94c7c0eb8dabea9c500dbd70fa042497497689 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 18 Nov 2025 16:52:41 -0800 Subject: [PATCH 168/578] Move online quantization to `model.load_weights` (#26327) Signed-off-by: Jerry Zhang --- examples/offline_inference/rlhf.py | 2 +- .../offline_inference/rlhf_online_quant.py | 162 ++++++++++++++ .../model_loader/default_loader.py | 46 +--- .../model_loader/online_quantization.py | 205 +++++++++++------- vllm/model_executor/model_loader/utils.py | 8 + vllm/model_executor/models/utils.py | 4 + 6 files changed, 314 insertions(+), 113 deletions(-) create mode 100644 examples/offline_inference/rlhf_online_quant.py diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 0c09e603271d..6f05968ce065 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -62,7 +62,7 @@ def __init__(self, *args, **kwargs): # Create a placement group that reserves GPU 1–2 for the vLLM inference engine. # Learn more about Ray placement groups: -# https://docs.ray.io/en/latest/placement-groups.html +# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) ray.get(pg_inference.ready()) scheduling_inference = PlacementGroupSchedulingStrategy( diff --git a/examples/offline_inference/rlhf_online_quant.py b/examples/offline_inference/rlhf_online_quant.py new file mode 100644 index 000000000000..2d98ad22c589 --- /dev/null +++ b/examples/offline_inference/rlhf_online_quant.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray. + +The script separates training and inference workloads onto distinct GPUs +so that Ray can manage process placement and inter-process communication. +A Hugging Face Transformer model occupies GPU 0 for training, whereas a +tensor-parallel vLLM inference engine occupies GPU 1–2. + +The example performs the following steps: + +* Load the training model on GPU 0. +* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism + and Ray placement groups. +* Generate text from a list of prompts using the inference engine. +* Update the weights of the training model and broadcast the updated weights + to the inference engine by using a Ray collective RPC group. Note that + for demonstration purposes we simply zero out the weights. + +For a production-ready implementation that supports multiple training and +inference replicas, see the OpenRLHF framework: +https://github.com/OpenRLHF/OpenRLHF + +This example assumes a single-node cluster with three GPUs, but Ray +supports multi-node clusters. vLLM expects the GPUs are only used for vLLM +workloads. Residual GPU activity interferes with vLLM memory profiling and +causes unexpected behavior. 
+""" + +import json +import os + +import ray +import torch +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from rlhf_utils import stateless_init_process_group +from torchao.core.config import config_to_dict +from torchao.quantization import ( + Float8DynamicActivationFloat8WeightConfig, + PerRow, +) +from transformers import AutoModelForCausalLM + +from vllm import LLM, SamplingParams +from vllm.utils.network_utils import get_ip, get_open_port + + +class MyLLM(LLM): + """Configure the vLLM worker for Ray placement group execution.""" + + def __init__(self, *args, **kwargs): + # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray + # so that vLLM can manage its own device placement within the worker. + os.environ.pop("CUDA_VISIBLE_DEVICES", None) + super().__init__(*args, **kwargs) + + +# Load the OPT-125M model onto GPU 0 for the training workload. +train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +train_model.to("cuda:0") + +# Initialize Ray and set the visible devices. The vLLM engine will +# be placed on GPUs 1 and 2. +os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" +ray.init() + +# Create a placement group that reserves GPU 1–2 for the vLLM inference engine. +# Learn more about Ray placement groups: +# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html +pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) +ray.get(pg_inference.ready()) +scheduling_inference = PlacementGroupSchedulingStrategy( + placement_group=pg_inference, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=0, +) + +# Launch the vLLM inference engine. The `enforce_eager` flag reduces +# start-up latency. + +# generate torchao quantization config for RL rollout +# see https://github.com/vllm-project/vllm/pull/23014 for instructions to +# use serialized config files instead of passing around json string +config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()) + +json_str = json.dumps(config_to_dict(config)) + +llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=scheduling_inference, +)(MyLLM).remote( + model="facebook/opt-125m", + hf_overrides={"quantization_config_dict_json": json_str}, + enforce_eager=True, + worker_extension_cls="rlhf_utils.WorkerExtension", + tensor_parallel_size=2, + distributed_executor_backend="ray", +) + +# Generate text from the prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0) + +outputs = ray.get(llm.generate.remote(prompts, sampling_params)) + +print("-" * 50) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + +# Set up the communication channel between the training process and the +# inference engine. +master_address = get_ip() +master_port = get_open_port() + +handle = llm.collective_rpc.remote( + "init_weight_update_group", args=(master_address, master_port, 1, 3) +) + +model_update_group = stateless_init_process_group( + master_address, master_port, 0, 3, torch.device("cuda:0") +) +ray.get(handle) + +# Simulate a training step by zeroing out all model weights. +# In a real RLHF training loop the weights would be updated using the gradient +# from an RL objective such as PPO on a reward model. 
+for name, p in train_model.named_parameters(): + p.data.zero_() + +# Synchronize the updated weights to the inference engine. +for name, p in train_model.named_parameters(): + dtype_name = str(p.dtype).split(".")[-1] + handle = llm.collective_rpc.remote( + "update_weight", args=(name, dtype_name, p.shape) + ) + model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) + ray.get(handle) + +# Verify that the inference weights have been updated. +assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) + +# Generate text with the updated model. The output is expected to be nonsense +# because the weights are zero. +outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +print("-" * 50) +for output in outputs_updated: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index c06ac550a94a..b80026741781 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -22,6 +22,7 @@ fastsafetensors_weights_iterator, filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, + get_quant_config, maybe_download_from_modelscope, multi_thread_pt_weights_iterator, multi_thread_safetensors_weights_iterator, @@ -273,42 +274,17 @@ def download_model(self, model_config: ModelConfig) -> None: ) def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: - if model_config.quantization == "torchao" and torchao_version_at_least( - "0.14.0" - ): - self.load_config.safetensors_load_strategy = "torchao" - weights_to_load = {name for name, _ in model.named_parameters()} - - # if we don't have `model.weight_metadata_and_attr_saved` defined and - # set to True, it means that this is either offline quantization case - # or the first run of online quantization - # see online_quantization.py for detailed notes - offline_quantization_or_first_run_of_online_quantization = not getattr( - model, "weight_metadata_and_attr_saved", False - ) + if model_config.quantization == "torchao": + quant_config = get_quant_config(model_config, self.load_config) + if ( + hasattr(quant_config, "is_checkpoint_torchao_serialized") + and quant_config.is_checkpoint_torchao_serialized + and torchao_version_at_least("0.14.0") + ): + self.load_config.safetensors_load_strategy = "torchao" - if model_config.quantization is None: - # model is not quantized - loaded_weights = model.load_weights( - self.get_all_weights(model_config, model) - ) - elif offline_quantization_or_first_run_of_online_quantization: - # case 1: offline quantized checkpoint - # case 2: Step I1 first run of weight loading with - # online quantization - # see online_quantization.py for detailed notes - loaded_weights = model.load_weights( - self.get_all_weights(model_config, model) - ) - else: - # to avoid circular dependency - from vllm.model_executor.model_loader.online_quantization import ( - load_weights_and_online_quantize, - ) - - # subsequent runs of weight loading with online - # quantization - loaded_weights = load_weights_and_online_quantize(self, model, model_config) + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights(self.get_all_weights(model_config, model)) self.counter_after_loading_weights = time.perf_counter() logger.info_once( diff --git 
a/vllm/model_executor/model_loader/online_quantization.py b/vllm/model_executor/model_loader/online_quantization.py index 890dd7231a0e..f330af85bbe8 100644 --- a/vllm/model_executor/model_loader/online_quantization.py +++ b/vllm/model_executor/model_loader/online_quantization.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import types +from collections.abc import Iterable import torch from torch import nn from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.model_loader.default_loader import DefaultModelLoader from vllm.model_executor.model_loader.utils import process_weights_after_loading logger = init_logger(__name__) @@ -56,6 +56,9 @@ # R4. quantize weights (by calling process_weights_after_loading), # also set `process_weights_after_loading_already_called` to # True to stop it from running again +# R5. (workaround for cudagraph), we restore the weight params to original quantized +# weights params, and use original_weight_param.copy_(updated_weight_param) so that +# the weight update work well with cudagraph # process_weights_after_loading (if called): # this will be skipped since it's already ran in # load_weights @@ -69,14 +72,6 @@ def maybe_save_metadata_and_attributes_for_weight_reloading( if model_config.quantization != "torchao": return - if getattr(model, "process_weights_after_loading_already_called", False): - # In case `process_weights_after_loading` is called multiple times - # we'll skip it at later times - logger.warning( - "process_weights_after_loading already called for model %s", model - ) - return - from vllm.model_executor.model_loader.weight_utils import get_quant_config quant_config = get_quant_config(model_config, None) @@ -137,6 +132,7 @@ def maybe_save_metadata_and_attributes_for_weight_reloading( else: model.recorded_weight_attr[name][key] = attr # mark the metadata and attributes saved so we don't run it again + model._model_config = model_config model.weight_metadata_and_attr_saved = True @@ -148,77 +144,132 @@ def _bond_method_to_cls(func, obj): return types.MethodType(func, obj) -def load_weights_and_online_quantize( - model_loader: DefaultModelLoader, model: nn.Module, model_config: ModelConfig -) -> set[str]: +def support_quantized_model_reload_from_hp_weights(original_load_weights): + """Decorator for `load_weights` method for AutoWeightsLoader.load_weights to support + reloading high precision (bfloat16/float16/float32) weight for an already quantized + model, this involves restoring the weights to a high precision weights and + then online quantize the weights + """ # online quantization, right now only enabled for # torchao - # R1, R2, R3, R4 in the Notes - - # TODO: Add fp8 support - assert model_config.quantization == "torchao", ( - "online quantization is only enabled for torchao currently" - ) - # TODO: use create_weights to restore the weights to original state - - # Step R1: First restore the quantized weights to original bfloat16 - # weights, with original metadata (shape, dtype, device) - # and attributes, so that bfloat16 weights can be loaded properly - existing_param_names = dict(model.named_parameters(remove_duplicate=False)).keys() - named_modules = dict(model.named_modules(remove_duplicate=False)) - model_device = None - - # Step R2: recover the parameter to the state before first loading - for name, d in model.original_weights_rebuild_keys.items(): - _shape = d["shape"] - _dtype = d["dtype"] - _device = d["device"] + # R1, R2, R3, R4, R5 in the Notes 
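Step R5 above is needed because a captured CUDA graph replays its kernels against the exact tensor storage that existed at capture time, so rebinding a module attribute to a fresh Parameter after re-quantization would leave the replayed graph reading stale memory. A minimal runnable sketch of the in-place update pattern, with original_param and updated_param as toy stand-ins for the captured quantized weight and the freshly re-quantized weight:

    import torch

    original_param = torch.nn.Parameter(torch.zeros(4, 4), requires_grad=False)
    updated_param = torch.nn.Parameter(torch.ones(4, 4), requires_grad=False)

    # Copy the new values into the original storage instead of replacing the
    # Parameter object, so addresses captured earlier stay valid.
    with torch.no_grad():
        original_param.copy_(updated_param)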
+ + def patched_model_load_weights( + auto_weight_loader, weights: Iterable[tuple[str, torch.Tensor]], *, mapper=None + ) -> set[str]: + model = auto_weight_loader.module + offline_quantization_or_first_run_of_online_quantization = not getattr( + model, "weight_metadata_and_attr_saved", False + ) + + # if we don't have `model.weight_metadata_and_attr_saved` defined and + # set to True, it means that this is either offline quantization case + # or the first run of online quantization + # see Notes in this file for more details + if offline_quantization_or_first_run_of_online_quantization: + # case 1: offline quantized checkpoint + # case 2: Step I1 first run of weight loading with + # online quantization + return original_load_weights(auto_weight_loader, weights, mapper=mapper) + + model_config = model._model_config + + # TODO: Add fp8 support + assert model_config.quantization == "torchao", ( + "online quantization is only enabled for torchao currently" + ) + # TODO: use create_weights to restore the weights to original state + + # Step R1: First restore the quantized weights to original bfloat16 + # weights, with original metadata (shape, dtype, device) + # and attributes, so that bfloat16 weights can be loaded properly + # TODO: maybe set remove_duplicate to True? + original_quantized_weight_dict = dict( + model.named_parameters(remove_duplicate=False) + ) + named_modules = dict(model.named_modules(remove_duplicate=False)) + model_device = None + + for name, d in model.original_weights_rebuild_keys.items(): + _shape = d["shape"] + _dtype = d["dtype"] + _device = d["device"] + if model_device is not None: + assert model_device == _device, ( + "Expecting all weights " + "to be in the same device for now, got both: " + f"{model_device} and {_device}" + ) + else: + model_device = _device + + if name in original_quantized_weight_dict: + module_name, weight_name = name.rsplit(".", 1) + module = named_modules[module_name] + setattr( + module, + weight_name, + torch.nn.Parameter( + torch.empty(_shape, dtype=_dtype, device=_device), + requires_grad=False, + ), + ) + + # Step R2: recover the weight attributes to the state before first loading + # recorded_weight_attr is + # {"weight_name": {"weight_attr_key": attr}} + # e.g. + # { + # { + # "layer.0.weight": { + # "weight_loader": weight_loader_function_object, + # "input_dim": 0, ... 
+ # }, + # "layer.1.weight": ..., + # } + # } + for full_weight_name, weight_attr_dict in model.recorded_weight_attr.items(): + for attr_name, attr in weight_attr_dict.items(): + module_name, weight_name = full_weight_name.rsplit(".", 1) + module = named_modules[module_name] + weight = getattr(module, weight_name) + if not hasattr(weight, attr_name): + setattr(weight, attr_name, _bond_method_to_cls(attr, weight)) + + # Step R3: reload bfloat16 / high precision weights + updated_params = original_load_weights( + auto_weight_loader, weights, mapper=mapper + ) + + # Step R4: online quantize the weights + # manually process weights after loading + model.process_weights_after_loading_already_called = False if model_device is not None: - assert model_device == _device, ( - "Expecting all weights " - "to be in the same device for now, got both: " - f"{model_device} and {_device}" - ) + process_weights_after_loading(model, model_config, model_device) else: - model_device = _device - - if name in existing_param_names: - module_name, weight_name = name.rsplit(".", 1) - module = named_modules[module_name] - setattr( - module, - weight_name, - torch.nn.Parameter(torch.empty(_shape, dtype=_dtype, device=_device)), + logger.warning_once( + "model_device is None, skip calling process_weights_after_loading" ) - # recorded_weight_attr is - # {"weight_name": {"weight_attr_key": attr}} - # e.g. - # { - # { - # "layer.0.weight": { - # "weight_loader": weight_loader_function_object, - # "input_dim": 0, ... - # }, - # "layer.1.weight": ..., - # } - # } - for full_weight_name, weight_attr_dict in model.recorded_weight_attr.items(): - for attr_name, attr in weight_attr_dict.items(): - module_name, weight_name = full_weight_name.rsplit(".", 1) - module = named_modules[module_name] - weight = getattr(module, weight_name) - if not hasattr(weight, attr_name): - setattr(weight, attr_name, _bond_method_to_cls(attr, weight)) - - # Step I1: reload bfloat16 / high precision weights - loaded_weights = model.load_weights( - model_loader.get_all_weights(model_config, model) - ) - - # Step I2: online quantize the weights - # manually process weights after loading - model.process_weights_after_loading_already_called = False - process_weights_after_loading(model, model_config, model_device) - model.process_weights_after_loading_already_called = True - return loaded_weights + # Step R5 (workaround for cudagraph): restore the original quantized weights + # and do a copy_ of the currents weights to the original weights + updated_quantized_weights = dict(model.named_parameters(remove_duplicate=False)) + for name in model.original_weights_rebuild_keys: + if name in original_quantized_weight_dict: + original_quantized_weight = original_quantized_weight_dict[name] + updated_quantized_weight = updated_quantized_weights[name] + + module_name, weight_name = name.rsplit(".", 1) + module = named_modules[module_name] + setattr(module, weight_name, original_quantized_weight) + with torch.no_grad(): + original_quantized_weight.copy_(updated_quantized_weight) + + del original_quantized_weight_dict + del named_modules + del updated_quantized_weight + + model.process_weights_after_loading_already_called = True + return updated_params + + return patched_model_load_weights diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index ba708a098c0d..e74434e9d12c 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -88,6 +88,14 @@ def initialize_model( def 
process_weights_after_loading( model: nn.Module, model_config: ModelConfig, target_device: torch.device ) -> None: + if getattr(model, "process_weights_after_loading_already_called", False): + # In case `process_weights_after_loading` is called multiple times + # we'll skip it at later times + logger.debug_once( + "process_weights_after_loading already called for model %s", model + ) + return + # to avoid circular dependency from vllm.model_executor.model_loader.online_quantization import ( maybe_save_metadata_and_attributes_for_weight_reloading, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ca5af358e2ee..ccefd7e66697 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) +from vllm.model_executor.model_loader.online_quantization import ( + support_quantized_model_reload_from_hp_weights, +) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors @@ -316,6 +319,7 @@ def _load_module( ) raise ValueError(msg) + @support_quantized_model_reload_from_hp_weights def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]], From 40b6b38f2c8f8df1dbc145b48df99575f191014f Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 18 Nov 2025 18:10:02 -0800 Subject: [PATCH 169/578] [Core] Switch Flat logprob control from environment variable to SamplingParams (#28914) Signed-off-by: Jialin Ouyang Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/samplers/test_logprobs.py | 3 +-- tests/test_logprobs.py | 32 ++++++++++---------------------- vllm/envs.py | 6 ------ vllm/logprobs.py | 10 ++++------ vllm/sampling_params.py | 6 ++++++ vllm/v1/engine/logprobs.py | 17 ++++++++++++----- 6 files changed, 33 insertions(+), 41 deletions(-) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index c9d227599cde..ea40c4802720 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -24,9 +24,7 @@ def test_ranks( greedy, flat_logprobs, example_prompts, - monkeypatch: pytest.MonkeyPatch, ): - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1" if flat_logprobs else "0") with vllm_runner(model, dtype=dtype, max_logprobs=MAX_LOGPROBS) as vllm_model: tokenizer = vllm_model.llm.get_tokenizer() example_prompt_tokens = [tokenizer.encode(prompt) for prompt in example_prompts] @@ -36,6 +34,7 @@ def test_ranks( max_tokens=MAX_TOKENS, logprobs=NUM_TOP_LOGPROBS, prompt_logprobs=NUM_PROMPT_LOGPROBS, + flat_logprobs=flat_logprobs, ) results = vllm_model.generate_w_logprobs(example_prompts, sampling_params) diff --git a/tests/test_logprobs.py b/tests/test_logprobs.py index d26a460d2bca..75e9d337aa24 100644 --- a/tests/test_logprobs.py +++ b/tests/test_logprobs.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - from vllm.logprobs import ( FlatLogprobs, Logprob, @@ -14,24 +12,20 @@ ) -def test_create_logprobs_non_flat(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0") - - prompt_logprobs = create_prompt_logprobs() +def test_create_logprobs_non_flat() -> None: + prompt_logprobs = create_prompt_logprobs(flat_logprobs=False) assert isinstance(prompt_logprobs, list) # Ensure first prompt position logprobs is None assert len(prompt_logprobs) == 1 
assert prompt_logprobs[0] is None - sample_logprobs = create_sample_logprobs() + sample_logprobs = create_sample_logprobs(flat_logprobs=False) assert isinstance(sample_logprobs, list) assert len(sample_logprobs) == 0 -def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1") - - prompt_logprobs = create_prompt_logprobs() +def test_create_logprobs_flat() -> None: + prompt_logprobs = create_prompt_logprobs(flat_logprobs=True) assert isinstance(prompt_logprobs, FlatLogprobs) assert prompt_logprobs.start_indices == [0] assert prompt_logprobs.end_indices == [0] @@ -43,7 +37,7 @@ def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None: assert len(prompt_logprobs) == 1 assert prompt_logprobs[0] == dict() - sample_logprobs = create_sample_logprobs() + sample_logprobs = create_sample_logprobs(flat_logprobs=True) assert isinstance(sample_logprobs, FlatLogprobs) assert len(sample_logprobs.start_indices) == 0 assert len(sample_logprobs.end_indices) == 0 @@ -54,11 +48,8 @@ def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None: assert len(sample_logprobs) == 0 -def test_append_logprobs_for_next_position_none_flat( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0") - logprobs = create_sample_logprobs() +def test_append_logprobs_for_next_position_none_flat() -> None: + logprobs = create_sample_logprobs(flat_logprobs=False) append_logprobs_for_next_position( logprobs, token_ids=[1], @@ -85,11 +76,8 @@ def test_append_logprobs_for_next_position_none_flat( ] -def test_append_logprobs_for_next_position_flat( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1") - logprobs = create_sample_logprobs() +def test_append_logprobs_for_next_position_flat() -> None: + logprobs = create_sample_logprobs(flat_logprobs=True) append_logprobs_for_next_position( logprobs, token_ids=[1], diff --git a/vllm/envs.py b/vllm/envs.py index 6bf05803e14e..62b3344ccd85 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -225,7 +225,6 @@ VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" - VLLM_FLAT_LOGPROBS: bool = False def get_default_cache_root(): @@ -1499,11 +1498,6 @@ def get_vllm_port() -> int | None: "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices( "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"] ), - # Flag to enable FlatLogprobs whose GC overhead is significantly smaller than - # the original list[dict[int, Logprob]] approach. - # After enabled, PromptLogprobs and SampleLogprobs would populated as - # FlatLogprobs. - "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))), } # --8<-- [end:env-vars-definition] diff --git a/vllm/logprobs.py b/vllm/logprobs.py index a34398db2c96..6a820308f523 100644 --- a/vllm/logprobs.py +++ b/vllm/logprobs.py @@ -5,8 +5,6 @@ from dataclasses import dataclass, field from typing import overload -import vllm.envs as envs - # We use dataclass for now because it is used for # openai server output, and msgspec is not serializable. 
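With this commit the flat layout is requested per call through SamplingParams rather than through the removed VLLM_FLAT_LOGPROBS environment variable. A rough usage sketch along the lines of the updated test above; the model name and prompt are placeholders:

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(
        temperature=0.0,
        max_tokens=16,
        logprobs=5,           # top-5 logprobs per generated token
        prompt_logprobs=5,    # top-5 logprobs per prompt token
        flat_logprobs=True,   # populate FlatLogprobs instead of list[dict]
    )
    outputs = llm.generate(["The capital of France is"], params)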
@@ -161,17 +159,17 @@ def __iter__(self) -> Iterator[LogprobsOnePosition]: SampleLogprobs = FlatLogprobs | list[LogprobsOnePosition] -def create_prompt_logprobs() -> PromptLogprobs: +def create_prompt_logprobs(flat_logprobs: bool) -> PromptLogprobs: """Creates a container to store prompt logprobs for a request""" - logprobs = FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else [] + logprobs = FlatLogprobs() if flat_logprobs else [] # NOTE: logprob of first prompt token is None. logprobs.append(None) return logprobs -def create_sample_logprobs() -> SampleLogprobs: +def create_sample_logprobs(flat_logprobs: bool) -> SampleLogprobs: """Creates a container to store decode logprobs for a request""" - return FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else [] + return FlatLogprobs() if flat_logprobs else [] def append_logprobs_for_next_position( diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 901d66163452..0fb1d67687c8 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -204,6 +204,12 @@ class SamplingParams( prompt_logprobs: int | None = None """Number of log probabilities to return per prompt token. When set to -1, return all `vocab_size` log probabilities.""" + flat_logprobs: bool = False + """Whether to return logprobs in flatten format (i.e. FlatLogprob) + for better performance. + NOTE: GC costs of FlatLogprobs is significantly smaller than + list[dict[int, Logprob]]. After enabled, PromptLogprobs and + SampleLogprobs would populated as FlatLogprobs.""" # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index b618d2347265..63064a2c65d6 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -43,15 +43,22 @@ def from_new_request( tokenizer: AnyTokenizer | None, request: EngineCoreRequest, ) -> "LogprobsProcessor": - assert request.sampling_params is not None - num_logprobs = request.sampling_params.logprobs - num_prompt_logprobs = request.sampling_params.prompt_logprobs + sampling_params = request.sampling_params + assert sampling_params is not None + num_logprobs = sampling_params.logprobs + num_prompt_logprobs = sampling_params.prompt_logprobs return cls( tokenizer=tokenizer, cumulative_logprob=(None if num_logprobs is None else 0.0), - logprobs=(None if num_logprobs is None else create_sample_logprobs()), + logprobs=( + None + if num_logprobs is None + else create_sample_logprobs(sampling_params.flat_logprobs) + ), prompt_logprobs=( - None if num_prompt_logprobs is None else create_prompt_logprobs() + None + if num_prompt_logprobs is None + else create_prompt_logprobs(sampling_params.flat_logprobs) ), num_prompt_logprobs=num_prompt_logprobs, num_logprobs=num_logprobs, From 20852c8f4c10d80204c47e0cb85f5b252ff51c86 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 19 Nov 2025 10:32:00 +0800 Subject: [PATCH 170/578] [CPU] Refactor CPU WNA16 (#28826) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 11 +- cmake/cpu_extension.cmake | 1 + csrc/cpu/cpu_attn_impl.hpp | 2 +- csrc/cpu/cpu_types_x86.hpp | 47 +- csrc/cpu/cpu_wna16.cpp | 402 +++++++++++ csrc/cpu/dnnl_helper.cpp | 6 +- csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp | 245 +++++++ csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp | 91 +++ csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp | 115 ++++ csrc/cpu/torch_bindings.cpp | 16 + csrc/cpu/utils.hpp | 55 ++ 
docs/getting_started/installation/cpu.md | 4 +- requirements/cpu.txt | 1 - tests/quantization/test_cpu_wna16.py | 23 + vllm/_custom_ops.py | 25 + vllm/config/model.py | 2 + vllm/envs.py | 5 - .../layers/fused_moe/cpu_fused_moe.py | 49 -- .../fused_moe/unquantized_fused_moe_method.py | 2 +- .../layers/quantization/__init__.py | 5 + .../layers/quantization/cpu_wna16.py | 625 ++++++++++++++++++ .../layers/quantization/ipex_quant.py | 2 +- 22 files changed, 1656 insertions(+), 78 deletions(-) create mode 100644 csrc/cpu/cpu_wna16.cpp create mode 100644 csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp create mode 100644 csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp create mode 100644 csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp create mode 100644 csrc/cpu/utils.hpp create mode 100644 tests/quantization/test_cpu_wna16.py create mode 100644 vllm/model_executor/layers/quantization/cpu_wna16.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 7479c43977d7..2267718f75ca 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -73,12 +73,11 @@ function cpu_tests() { pytest -x -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" - # Note: disable it until supports V1 - # Run AWQ test - # docker exec cpu-test-"$NUMA_NODE" bash -c " - # set -e - # pytest -x -s -v \ - # tests/quantization/test_ipex_quant.py" + # Run AWQ/GPTQ test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -s -v \ + tests/quantization/test_cpu_wna16.py" # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index aa84125818d1..fbbb03c5ed46 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -375,6 +375,7 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" + "csrc/cpu/cpu_wna16.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) set(VLLM_EXT_SRC diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 344296528b65..294b4f714a76 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -1,7 +1,6 @@ #ifndef CPU_ATTN_HPP #define CPU_ATTN_HPP -#include #include #include @@ -12,6 +11,7 @@ #include "cpu_types.hpp" #include "scratchpad_manager.h" #include "cpu_attn_macros.h" +#include "utils.hpp" namespace cpu_attention { enum class ISA { AMX, VEC, VEC16 }; diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 7ddf028e6e13..6f51277f7844 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -104,6 +104,8 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit FP16Vec16(const c10::Half v) : reg(_mm256_set1_epi16(v.x)) {} + explicit FP16Vec16(const FP32Vec16&); void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } @@ -141,6 +143,8 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit BF16Vec16(const c10::BFloat16 v) : reg(_mm256_set1_epi16(v.x)) {} + explicit BF16Vec16(const FP32Vec16&); void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } @@ -350,6 +354,22 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(__m512 data) : reg(data) {} + // de-pack 4 bit values + explicit FP32Vec16(int64_t value, const FP32Vec16& lut) { + int64_t mask_0 = 
0x0F0F0F0F0F0F0F0F; + int64_t mask_1 = 0xF0F0F0F0F0F0F0F0; + int64_t value_0 = value & mask_0; + int64_t value_1 = value & mask_1; + __m128i vec_0 = _mm_movpi64_epi64((__m64)value_0); + __m128i vec_1 = _mm_movpi64_epi64((__m64)value_1); + vec_0 = _mm_cvtepu8_epi16(vec_0); + vec_1 = _mm_cvtepu8_epi16(vec_1); + vec_1 = _mm_slli_epi16(vec_1, 4); + __m128i vec = _mm_or_si128(vec_0, vec_1); + __m512i vec_i32 = _mm512_cvtepu8_epi32(vec); + reg = _mm512_permutexvar_ps(vec_i32, lut.reg); + } + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( @@ -426,14 +446,6 @@ struct FP32Vec16 : public Vec { float get_last_elem() const { return _mm512_cvtss_f32(reg); } - template - float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { @@ -755,6 +767,25 @@ inline void non_temporal_save(BF16Vec16& vec, void* ptr) { inline void non_temporal_save(FP32Vec16& vec, void* ptr) { _mm512_stream_ps((float*)ptr, vec.reg); } + +static void interleave_save(const BF16Vec16& vec0, const BF16Vec16& vec1, + void* ptr) { + __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg); + __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg); + vec_1 = _mm512_slli_epi32(vec_1, 16); + vec_0 = _mm512_or_si512(vec_0, vec_1); + _mm512_storeu_epi32(ptr, vec_0); +} + +static void interleave_save(const FP16Vec16& vec0, const FP16Vec16& vec1, + void* ptr) { + __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg); + __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg); + vec_1 = _mm512_slli_epi32(vec_1, 16); + vec_0 = _mm512_or_si512(vec_0, vec_1); + _mm512_storeu_epi32(ptr, vec_0); +} + #endif inline void mem_barrier() { _mm_mfence(); } diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp new file mode 100644 index 000000000000..816d195506e5 --- /dev/null +++ b/csrc/cpu/cpu_wna16.cpp @@ -0,0 +1,402 @@ +#include "cpu_types.hpp" +#include "scratchpad_manager.h" +#include "utils.hpp" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" +#endif +#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp" + +#define VLLM_DISPATCH_CASE_16B_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) + +#define VLLM_DISPATCH_16B_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_16B_TYPES(__VA_ARGS__)) + +template +void print_logits(const char* name, T* ptr, int32_t row, int32_t col, + int32_t stride) { + std::stringstream ss; + ss << std::fixed << std::setprecision(5) << name << ": [\n"; + auto* curr_logits_buffer = ptr; + for (int32_t m = 0; m < row; ++m) { + for (int32_t n = 0; n < col; ++n) { + ss << curr_logits_buffer[n] << ", "; + } + ss << "\n"; + curr_logits_buffer += stride; + } + ss << "]\n"; + std::printf("%s", ss.str().c_str()); +} + +namespace { +using cpu_utils::ISA; +using cpu_utils::VecTypeTrait; + +template +class Dequantizer4b { + public: + constexpr static int32_t pack_num = 32 / 4; + using scalar_vec_t = typename VecTypeTrait::vec_t; + + public: + static void dequant(int32_t* __restrict__ q_weight, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ scales, + int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx, + const int64_t scales_stride, const int64_t zeros_stride, + const int32_t k_size, const int32_t group_size) { + vec_op::FP32Vec16 lut; + if constexpr (has_zp) { + // AWQ + alignas(64) static const float LUT[16] = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + lut = vec_op::FP32Vec16(LUT); + } else { + // GPTQ + alignas(64) static const float LUT[16] = { + -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + lut = vec_op::FP32Vec16(LUT); + } + + // per 64-bits elem contains 16 output channels + int64_t* __restrict__ curr_q_weight = reinterpret_cast(q_weight); + int64_t* __restrict__ curr_zeros = reinterpret_cast(zeros); + scalar_t* __restrict__ curr_weight = weight; + scalar_t* __restrict__ curr_scale = scales; + vec_op::FP32Vec16 scale_0; + vec_op::FP32Vec16 scale_1; + vec_op::FP32Vec16 zero_0; + vec_op::FP32Vec16 zero_1; + int32_t group_counter = 0; + for (int32_t k_idx = 0; k_idx < k_size; k_idx += 2) { + int64_t qwb_0 = *curr_q_weight; + int64_t qwb_1 = *(curr_q_weight + 1); + vec_op::FP32Vec16 wb_0(qwb_0, lut); + vec_op::FP32Vec16 wb_1(qwb_1, lut); + + if constexpr (!use_desc_act) { + if (group_counter == 0) { + scale_0 = vec_op::FP32Vec16(scalar_vec_t(curr_scale)); + scale_1 = vec_op::FP32Vec16(scale_0); + curr_scale += scales_stride; + + if constexpr (has_zp) { + zero_0 = vec_op::FP32Vec16(*curr_zeros, lut); + zero_1 = vec_op::FP32Vec16(zero_0); + curr_zeros += zeros_stride / 2; + } + } + } else { + int32_t g_idx_0 = g_idx[k_idx]; + int32_t g_idx_1 = g_idx[k_idx + 1]; + scale_0 = vec_op::FP32Vec16( + scalar_vec_t(curr_scale + g_idx_0 * scales_stride)); + scale_1 = vec_op::FP32Vec16( + scalar_vec_t(curr_scale + g_idx_1 * scales_stride)); + if constexpr (has_zp) { + zero_0 = vec_op::FP32Vec16(*(curr_zeros + g_idx_0 * zeros_stride / 2), + lut); + zero_1 = vec_op::FP32Vec16(*(curr_zeros + g_idx_1 * zeros_stride / 2), + lut); + } + } + + if constexpr (has_zp) { + wb_0 = wb_0 - zero_0; + wb_1 = wb_1 - zero_1; + } + + wb_0 = wb_0 * scale_0; + wb_1 = wb_1 * scale_1; + + scalar_vec_t output_vec_0(wb_0); + scalar_vec_t output_vec_1(wb_1); + + // AMX needs to interlave K elements to pack as 32 bits + if constexpr (isa == ISA::AMX) { + vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight); + } else { + output_vec_0.save(curr_weight); + output_vec_1.save(curr_weight + 16); + } + + // update + curr_q_weight += 2; + curr_weight += 32; + if constexpr (!use_desc_act) { + group_counter += 2; + if (group_counter == group_size) { + group_counter = 0; + } + } 
+ } + } +}; +}; // namespace + +template +void cpu_gemm_wna16_impl( + scalar_t* __restrict__ input, int32_t* __restrict__ q_weight, + scalar_t* __restrict__ output, scalar_t* __restrict__ scales, + int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx, + scalar_t* __restrict__ bias, const int32_t m_size, const int32_t n_size, + const int32_t k_size, const int64_t input_stride, + const int64_t output_stride, const int64_t scales_group_stride, + const int64_t zeros_group_stride, const int32_t group_num, + const int32_t group_size, const int64_t pack_factor) { + constexpr int32_t gemm_n_tile_size = gemm_t::NSize; + constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize; + constexpr int32_t n_block_size = 16; + static_assert(gemm_n_tile_size % n_block_size == 0); + const int32_t thread_num = omp_get_max_threads(); + + // a simple schedule policy, just to hold more B tiles in L2 and make sure + // each thread has tasks + const int32_t n_partition_size = [&]() { + const int64_t cache_size = cpu_utils::get_l2_size(); + int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t)); + int64_t ps_thread_limit = n_size / thread_num; + ps_cache_limit = + std::max((ps_cache_limit / gemm_n_tile_size) * gemm_n_tile_size, + (int64_t)gemm_n_tile_size); + ps_thread_limit = + std::max((ps_thread_limit / gemm_n_tile_size) * gemm_n_tile_size, + (int64_t)gemm_n_tile_size); + return std::min(ps_cache_limit, ps_thread_limit); + }(); + const int32_t task_num = (n_size + n_partition_size - 1) / n_partition_size; + + // get buffer size + const int64_t b_buffer_size = + (((n_partition_size * k_size * sizeof(scalar_t) + 63) / 64) * 64); + const int64_t c_buffer_size = + (((gemm_m_tile_size * gemm_n_tile_size * sizeof(float) + 63) / 64) * 64); + const int64_t b_buffer_offset = 0; + const int64_t c_buffer_offset = b_buffer_size; + const int64_t buffer_size = b_buffer_size + c_buffer_size; + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size * + thread_num); + + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + scalar_t* __restrict__ b_buffer = nullptr; + float* __restrict__ c_buffer = nullptr; + { + uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager() + ->get_data() + + thread_id * buffer_size; + b_buffer = reinterpret_cast(buffer_ptr + b_buffer_offset); + c_buffer = reinterpret_cast(buffer_ptr + c_buffer_offset); + } + + const int64_t q_weight_block_stride = n_block_size / pack_factor * k_size; + const int64_t b_buffer_block_stride = n_block_size * k_size; + const int32_t zeros_block_stride = n_block_size / pack_factor; + + gemm_t gemm; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + + if (task_id >= task_num) { + break; + } + + const int32_t n_start_idx = task_id * n_partition_size; + const int32_t n_block_start_idx = n_start_idx / n_block_size; + const int32_t n_num = std::min(n_partition_size, n_size - n_start_idx); + const int32_t n_block_num = n_num / n_block_size; + // std::printf("thread_id: %d, task_id: %d, n_start_idx: %d, n_num: %d\n", + // thread_id, task_id, n_start_idx, n_num); + + // dequant weight + { + int32_t* __restrict__ curr_q_weight = + q_weight + n_block_start_idx * q_weight_block_stride; + scalar_t* __restrict__ curr_b_buffer = b_buffer; + scalar_t* __restrict__ curr_scales = scales + n_start_idx; + int32_t* __restrict__ curr_zeros = zeros + n_start_idx / pack_factor; + for 
(int32_t block_idx = 0; block_idx < n_block_num; ++block_idx) { + dequantizer_t::dequant(curr_q_weight, curr_b_buffer, curr_scales, + curr_zeros, g_idx, scales_group_stride, + zeros_group_stride, k_size, group_size); + + // if (block_idx == 0 && n_start_idx == 0) { + // print_logits("depacked weight", curr_b_buffer, k_size, + // n_block_size, n_block_size); + // } + + // update + curr_q_weight += q_weight_block_stride; + curr_b_buffer += b_buffer_block_stride; + curr_scales += n_block_size; + curr_zeros += zeros_block_stride; + } + } + + // compute loop + { + const int32_t n_tile_num = n_num / gemm_n_tile_size; + scalar_t* __restrict__ curr_input = input; + scalar_t* __restrict__ init_bias = bias; + if (bias != nullptr) { + init_bias += n_start_idx; + } + scalar_t* __restrict__ init_output = output + n_start_idx; + for (int32_t m_idx = 0; m_idx < m_size; m_idx += gemm_m_tile_size) { + const int32_t curr_m_size = + std::min(gemm_m_tile_size, m_size - m_idx); + scalar_t* __restrict__ curr_b_buffer = b_buffer; + scalar_t* __restrict__ curr_bias = init_bias; + scalar_t* __restrict__ curr_output = init_output; + for (int32_t n_tile_idx = 0; n_tile_idx < n_tile_num; ++n_tile_idx) { + gemm.gemm(curr_input, curr_b_buffer, c_buffer, curr_m_size, k_size, + input_stride, b_buffer_block_stride, gemm_n_tile_size, + false); + + if (bias != nullptr) { + cpu_micro_gemm::bias_epilogue( + c_buffer, curr_output, curr_bias, curr_m_size, + gemm_n_tile_size, output_stride); + curr_bias += gemm_n_tile_size; + } else { + cpu_micro_gemm::default_epilogue( + c_buffer, curr_output, curr_m_size, gemm_n_tile_size, + output_stride); + } + + curr_b_buffer += + b_buffer_block_stride * (gemm_n_tile_size / n_block_size); + curr_output += gemm_n_tile_size; + } + curr_input += gemm_m_tile_size * input_stride; + init_output += gemm_m_tile_size * output_stride; + } + } + } + } +} + +void cpu_gemm_wna16( + const torch::Tensor& input, // [M, K] + const torch::Tensor& + q_weight, // [N / 16, K * 16 / pack_factor], packed as int32 + torch::Tensor& output, // [M, N] + const torch::Tensor& scales, // [group_num, N] + const std::optional& + zeros, // [group_num, N / pack_factor], packed as int32 + const std::optional& g_idx, // [K] + const std::optional& bias, // [N] + const int64_t pack_factor, const std::string& isa_hint) { + using cpu_utils::ISA; + TORCH_CHECK_EQ(pack_factor, 8); // only supports 4bits + const int32_t a_m_size = input.size(0); + const int32_t a_k_size = input.size(1); + const int64_t a_m_stride = input.stride(0); + const int32_t b_n_size = q_weight.size(0) * 16; + TORCH_CHECK_EQ(a_k_size % 32, 0); + TORCH_CHECK_EQ(b_n_size % 32, 0); + const int32_t group_num = scales.size(0); + const int32_t group_size = a_k_size / group_num; + TORCH_CHECK_EQ(group_size % 2, 0); + const int64_t scales_group_stride = scales.stride(0); + const int64_t output_m_stride = output.stride(0); + + bool has_zp = zeros.has_value(); + bool use_desc_act = g_idx.has_value(); + TORCH_CHECK(!(has_zp && use_desc_act)); + + ISA isa = [&]() { + if (isa_hint == "amx") { + return ISA::AMX; + } else if (isa_hint == "vec") { + return ISA::VEC; + } else { + TORCH_CHECK(false, "unsupported isa hint: " + isa_hint); + } + }(); + + int32_t* zeros_ptr = has_zp ? zeros->data_ptr() : nullptr; + const int64_t zeros_group_stride = has_zp ? zeros->stride(0) : 0; + int32_t* g_idx_ptr = use_desc_act ? 
g_idx->data_ptr() : nullptr; + + VLLM_DISPATCH_16B_TYPES(input.scalar_type(), "cpu_gemm_wna16", [&]() { + if (isa == ISA::AMX) { + using gemm_t = cpu_micro_gemm::MicroGemm; + if (has_zp) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + if (use_desc_act) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } else { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + } else if (isa == ISA::VEC) { + using gemm_t = cpu_micro_gemm::MicroGemm; + if (has_zp) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + if (use_desc_act) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } else { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? 
bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + } + }); +} diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index 02a8072ccf30..cfb6e78cba9a 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -396,9 +396,9 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) : DNNLMatMulPrimitiveHandler( static_cast(args), args.ab_type), m_size_cache_(nullptr) { - assert(ab_type_ == dnnl::memory::data_type::f32 || - ab_type_ == dnnl::memory::data_type::bf16 || - ab_type_ == dnnl::memory::data_type::f16); + assert(b_type_ == dnnl::memory::data_type::f32 || + b_type_ == dnnl::memory::data_type::bf16 || + b_type_ == dnnl::memory::data_type::f16); dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, {b_k_stride_, b_n_stride_}); diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp new file mode 100644 index 000000000000..87a019773a89 --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp @@ -0,0 +1,245 @@ +#ifndef CPU_MICRO_GEMM_AMX_HPP +#define CPU_MICRO_GEMM_AMX_HPP +#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp" + +namespace cpu_micro_gemm { +namespace { +// AMX specific +constexpr static int64_t AMX_TILE_ROW_BYTES = 64; +constexpr static int64_t AMX_TILE_ROW_NUM = 16; +constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM; + +typedef struct __tile_config { + uint8_t palette_id = 1; + uint8_t start_row = 0; + uint8_t reserved_0[14] = {0}; + uint16_t colsb[16] = {0}; + uint8_t rows[16] = {0}; +} __tilecfg; + +// 2-2-4 pattern, for 16 < m <= 32 +// TILE 0, 1: load A matrix, row num should be 16, m - 16 +// TILE 2, 3: load B matrix, row num should be 16 +// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m +// - 16 +template +class TileGemm224 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unsupported data type for TileGemm224"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported data type for TileGemm224"); + } +}; + +template <> +class TileGemm224 { + public: + using scalar_t = c10::BFloat16; + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + c10::BFloat16* __restrict__ a_tile_0 = a_ptr; + c10::BFloat16* __restrict__ a_tile_1 = a_ptr + lda * AMX_TILE_ROW_NUM; + const int64_t a_tile_stride = lda * sizeof(c10::BFloat16); + + // B is always packed as 16 output channels block + c10::BFloat16* __restrict__ b_tile_2 = b_ptr; + c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride; + const int32_t b_tile_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_4 = c_ptr; + float* __restrict__ c_tile_5 = + c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float); + float* __restrict__ c_tile_6 = c_ptr + AMX_TILE_ROW_NUM * ldc; + float* __restrict__ c_tile_7 = + c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float); + const int32_t c_tile_stride = ldc * sizeof(float); + + if (accum_c) { + _tile_loadd(4, c_tile_4, c_tile_stride); + _tile_loadd(5, c_tile_5, c_tile_stride); + _tile_loadd(6, c_tile_6, c_tile_stride); + _tile_loadd(7, c_tile_7, c_tile_stride); + } else { + _tile_zero(4); + _tile_zero(5); + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_times; ++k) { + _tile_loadd(0, a_tile_0, 
a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_tile_stride); + _tile_dpbf16ps(4, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_tile_stride); + _tile_dpbf16ps(5, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_dpbf16ps(6, 1, 2); + _tile_dpbf16ps(7, 1, 3); + + // update ptrs + a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + _tile_stored(4, c_tile_4, c_tile_stride); + _tile_stored(5, c_tile_5, c_tile_stride); + _tile_stored(6, c_tile_6, c_tile_stride); + _tile_stored(7, c_tile_7, c_tile_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + const int32_t m_0 = AMX_TILE_ROW_NUM; + const int32_t m_1 = m - AMX_TILE_ROW_NUM; + config.rows[0] = m_0; + config.rows[1] = m_1; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = m_0; + config.rows[5] = m_0; + config.rows[6] = m_1; + config.rows[7] = m_1; + _tile_loadconfig(&config); + } +}; + +// 1-2-2 pattern, for 0 < m <= 16 +// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be +// m, m +// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row +// num should be 16 +// TILE 6, 7, (6, 7): store results C matrix, row num should be +// m +template +class TileGemm122 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unsupported data type for TileGemm122"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported data type for TileGemm122"); + } +}; + +template <> +class TileGemm122 { + public: + using scalar_t = c10::BFloat16; + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + c10::BFloat16* __restrict__ a_tile_0 = a_ptr; + c10::BFloat16* __restrict__ a_tile_1 = + a_ptr + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + const int64_t a_tile_stride = lda * sizeof(c10::BFloat16); + + c10::BFloat16* __restrict__ b_tile_2 = b_ptr; + c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride; + c10::BFloat16* __restrict__ b_tile_4 = + b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + c10::BFloat16* __restrict__ b_tile_5 = + b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + int64_t b_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_6 = c_ptr; + float* __restrict__ c_tile_7 = c_ptr + AMX_TILE_ROW_BYTES / sizeof(float); + int64_t c_stride = ldc * sizeof(float); + + const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + const int32_t k_group_times = k_times / 2; + const bool has_tail = (k_times % 2 == 1); + + if (accum_c) { + _tile_loadd(6, c_tile_6, c_stride); + _tile_loadd(7, c_tile_7, c_stride); + } else { + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_group_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_stream_loadd(4, b_tile_4, b_stride); + _tile_dpbf16ps(6, 1, 4); + _tile_stream_loadd(5, b_tile_5, b_stride); + _tile_dpbf16ps(7, 1, 5); + + // update ptrs + a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += 2 * AMX_TILE_BYTES / 
sizeof(c10::BFloat16); + b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + if (has_tail) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + } + + _tile_stored(6, c_tile_6, c_stride); + _tile_stored(7, c_tile_7, c_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + config.rows[0] = m; + config.rows[1] = m; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = AMX_TILE_ROW_NUM; + config.rows[5] = AMX_TILE_ROW_NUM; + config.rows[6] = m; + config.rows[7] = m; + _tile_loadconfig(&config); + } +}; +} // namespace + +// Gemm kernel uses AMX, requires B matrix to be packed +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 32; + static constexpr int32_t NSize = 32; + + public: + MicroGemm() : curr_m_(-1) { + vec_op::unroll_loop([&](int i) { amx_tile_config_.colsb[i] = 64; }); + } + + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + if (m > AMX_TILE_ROW_NUM) { + if (m != curr_m_) { + curr_m_ = m; + TileGemm224::init_tile_config(m, amx_tile_config_); + } + TileGemm224::gemm(CPU_MICRO_GEMM_PARAMS); + } else { + if (m != curr_m_) { + curr_m_ = m; + TileGemm122::init_tile_config(m, amx_tile_config_); + } + TileGemm122::gemm(CPU_MICRO_GEMM_PARAMS); + } + } + + private: + alignas(64) __tilecfg amx_tile_config_; + int32_t curr_m_; +}; + +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp new file mode 100644 index 000000000000..784da55a420e --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp @@ -0,0 +1,91 @@ +#ifndef CPU_MICRO_GEMM_IMPL_HPP +#define CPU_MICRO_GEMM_IMPL_HPP +#include "cpu/utils.hpp" +#include "cpu/cpu_types.hpp" + +namespace cpu_micro_gemm { +#define DEFINE_CPU_MICRO_GEMM_PARAMS \ + scalar_t *__restrict__ a_ptr, scalar_t *__restrict__ b_ptr, \ + float *__restrict__ c_ptr, const int32_t m, const int32_t k, \ + const int64_t lda, const int64_t b_n_group_stride, const int64_t ldc, \ + const bool accum_c + +#define CPU_MICRO_GEMM_PARAMS \ + a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c + +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 16; + static constexpr int32_t NSize = 16; + + public: + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unimplemented MicroGemm."); + } +}; + +template +FORCE_INLINE void default_epilogue(float* __restrict__ c_ptr, + scalar_t* __restrict__ d_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + + float* __restrict__ curr_c = c_ptr; + scalar_t* __restrict__ curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* __restrict__ curr_c_iter = curr_c; + scalar_t* __restrict__ curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + scalar_vec_t c_vec(c_vec_fp32); + c_vec.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} + +template +FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr, + scalar_t* __restrict__ d_ptr, + scalar_t* __restrict__ bias_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename 
cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + constexpr int32_t n_group_num = n_size / 16; + static_assert(n_group_num <= 16); + + vec_op::FP32Vec16 bias_vecs[n_group_num]; + scalar_t* __restrict__ curr_bias = bias_ptr; + vec_op::unroll_loop([&](int32_t i) { + scalar_vec_t vec(curr_bias); + bias_vecs[i] = vec_op::FP32Vec16(vec); + curr_bias += 16; + }); + + float* __restrict__ curr_c = c_ptr; + scalar_t* __restrict__ curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* __restrict__ curr_c_iter = curr_c; + scalar_t* __restrict__ curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx]; + scalar_vec_t c_vec(c_vec_fp32); + c_vec.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp new file mode 100644 index 000000000000..3985c2f2e5fe --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp @@ -0,0 +1,115 @@ +#ifndef CPU_MICRO_GEMM_VEC_HPP +#define CPU_MICRO_GEMM_VEC_HPP +#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp" + +namespace cpu_micro_gemm { +namespace { +// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32] +template +class TileGemm82 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + switch (m) { + case 1: + gemm_micro<1>(CPU_MICRO_GEMM_PARAMS); + break; + case 2: + gemm_micro<2>(CPU_MICRO_GEMM_PARAMS); + break; + case 3: + gemm_micro<3>(CPU_MICRO_GEMM_PARAMS); + break; + case 4: + gemm_micro<4>(CPU_MICRO_GEMM_PARAMS); + break; + case 5: + gemm_micro<5>(CPU_MICRO_GEMM_PARAMS); + break; + case 6: + gemm_micro<6>(CPU_MICRO_GEMM_PARAMS); + break; + case 7: + gemm_micro<7>(CPU_MICRO_GEMM_PARAMS); + break; + case 8: + gemm_micro<8>(CPU_MICRO_GEMM_PARAMS); + break; + } + } + + template + static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) { + static_assert(0 < M <= 8); + using load_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + + scalar_t* __restrict__ curr_b_0 = b_ptr; + scalar_t* __restrict__ curr_b_1 = b_ptr + b_n_group_stride; + float* __restrict__ curr_c_0 = c_ptr; + float* __restrict__ curr_c_1 = c_ptr + 16; + + vec_op::FP32Vec16 c_regs[M * 2]; + if (accum_c) { + float* __restrict__ curr_m_c_0 = curr_c_0; + float* __restrict__ curr_m_c_1 = curr_c_1; + vec_op::unroll_loop([&](int32_t i) { + c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0); + c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1); + + // update + curr_m_c_0 += ldc; + curr_m_c_1 += ldc; + }); + } + + scalar_t* __restrict__ curr_a = a_ptr; + for (int32_t k_idx = 0; k_idx < k; ++k_idx) { + load_vec_t b_0_reg(curr_b_0); + vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg); + load_vec_t b_1_reg(curr_b_1); + vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg); + + scalar_t* __restrict__ curr_m_a = curr_a; + vec_op::unroll_loop([&](int32_t i) { + scalar_t v = *curr_m_a; + load_vec_t a_reg_original(v); + vec_op::FP32Vec16 a_reg(a_reg_original); + c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg; + c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg; + + // update + curr_m_a += lda; + }); + + // update + curr_a += 1; + curr_b_0 += 16; + curr_b_1 += 16; + } + + vec_op::unroll_loop([&](int32_t i) { + c_regs[i * 2].save(curr_c_0); + c_regs[i * 2 + 1].save(curr_c_1); + + // update + curr_c_0 += ldc; + curr_c_1 += ldc; + }); + } +}; +} // namespace + +// Gemm 
kernel uses vector instructions, requires B matrix to be packed +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 8; + static constexpr int32_t NSize = 32; + + public: + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TileGemm82::gemm(CPU_MICRO_GEMM_PARAMS); + } +}; +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 9fefd88cd9b0..b07d20bab7dd 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -103,6 +103,13 @@ void cpu_attention_with_kv_cache( // Note: just for avoiding importing errors void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); } +void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight, + torch::Tensor& output, const torch::Tensor& scales, + const std::optional& zeros, + const std::optional& g_idx, + const std::optional& bias, + const int64_t pack_factor, const std::string& isa_hint); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -283,6 +290,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("static_scaled_fp8_quant() -> ()", placeholder_op); ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op); ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op); + + // WNA16 +#if defined(__AVX512F__) + ops.def( + "cpu_gemm_wna16(Tensor input, Tensor q_weight, Tensor(a2!) output, " + "Tensor scales, Tensor? zeros, Tensor? g_idx, Tensor? bias, SymInt " + "pack_factor, str isa_hint) -> ()"); + ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp new file mode 100644 index 000000000000..d8399c56f6af --- /dev/null +++ b/csrc/cpu/utils.hpp @@ -0,0 +1,55 @@ +#ifndef UTILS_HPP +#define UTILS_HPP + +#include +#include +#include +#include + +#include "cpu_types.hpp" + +namespace cpu_utils { +enum class ISA { AMX, VEC }; + +template +struct VecTypeTrait { + using vec_t = void; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::FP32Vec16; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::BF16Vec16; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::FP16Vec16; +}; + +struct Counter { + std::atomic counter; + char _padding[56]; + + Counter() : counter(0) {} + + void reset_counter() { counter.store(0); } + + int64_t acquire_counter() { return counter++; } +}; + +inline int64_t get_l2_size() { + static int64_t size = []() { + long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); + assert(l2_cache_size != -1); + return l2_cache_size >> 1; // use 50% of L2 cache + }(); + return size; +} +} // namespace cpu_utils + +#endif diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index be99cef3723e..d1beab7855b1 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -97,7 +97,6 @@ Currently, there are no pre-built CPU wheels. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists, `auto` (by default), or `nobind` (to disable binding to individual CPU cores and to inherit user-defined OpenMP variables). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. 
`VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. If set to `nobind`, the number of OpenMP threads is determined by the standard `OMP_NUM_THREADS` environment variable. - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`. - `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence. -- `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False). ## FAQ @@ -191,10 +190,9 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel - GPTQ (x86 only) - compressed-tensor INT8 W8A8 (x86, s390x) -### (x86 only) What is the purpose of `VLLM_CPU_MOE_PREPACK` and `VLLM_CPU_SGL_KERNEL`? +### (x86 only) What is the purpose of `VLLM_CPU_SGL_KERNEL`? - Both of them require `amx` CPU flag. - - `VLLM_CPU_MOE_PREPACK` can provide better performance for MoE models - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios. ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker? diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d11787df4d92..e23d3286f3f7 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -22,7 +22,6 @@ datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs intel-openmp==2024.2.1; platform_machine == "x86_64" -intel_extension_for_pytorch==2.8.0; platform_machine == "x86_64" triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile. 
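Stepping back from the diff for a moment: the CPU-backend environment variables documented in `cpu.md` above are typically exported before launching vLLM. The snippet below is an editorial sketch, not part of this patch; the core range, KV-cache size, and model name are placeholder assumptions (the model is the GPTQ checkpoint used by the new `test_cpu_wna16.py` test), and `VLLM_CPU_SGL_KERNEL` only helps on AMX-capable CPUs.

```python
# Illustrative sketch only: run the CPU backend with explicit thread binding.
# All values are placeholders; adjust them to the local machine.
import os

os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"       # GiB reserved for the CPU KV cache
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31"  # bind 32 OpenMP threads to cores 0-31
os.environ["VLLM_CPU_SGL_KERNEL"] = "1"           # small-batch kernels (AMX CPUs only)

from vllm import LLM, SamplingParams

llm = LLM(model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", dtype="bfloat16")
outputs = llm.generate(["The capital of France is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```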
# Use this to gather CPU info and optimize based on ARM Neoverse cores diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py new file mode 100644 index 000000000000..077b802e559d --- /dev/null +++ b/tests/quantization/test_cpu_wna16.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from vllm.platforms import current_platform + +if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +MODELS = [ + "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ", + "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", # with g_idx +] +DTYPE = ["bfloat16"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", DTYPE) +def test_ipex_quant(vllm_runner, model, dtype): + with vllm_runner(model, dtype=dtype) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + assert output + print(output) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 096266c9764e..66cf6472eee4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2702,6 +2702,31 @@ def cpu_attention_with_kv_cache( ) +def cpu_gemm_wna16( + input: torch.Tensor, + q_weight: torch.Tensor, + scales: torch.Tensor, + zeros: torch.Tensor | None, + g_idx: torch.Tensor | None, + bias: torch.Tensor | None, + pack_factor: int, + isa_hint: str, +) -> torch.Tensor: + output = torch.empty((input.size(0), scales.size(1)), dtype=input.dtype) + torch.ops._C.cpu_gemm_wna16( + input, + q_weight, + output, + scales, + zeros, + g_idx, + bias, + pack_factor, + isa_hint, + ) + return output + + if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"): @register_fake("_qutlass_C::matmul_mxf4_bf16_tn") diff --git a/vllm/config/model.py b/vllm/config/model.py index 49fe0bcd9a2a..3e8790a26e0e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1020,6 +1020,8 @@ def _verify_quantization(self) -> None: # Ensure heavy backends are probed last to avoid unnecessary # imports during override detection (e.g., MXFP4 imports Triton) "mxfp4", + "cpu_gptq", + "cpu_awq", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/envs.py b/vllm/envs.py index 62b3344ccd85..6d92d5afee50 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -50,7 +50,6 @@ VLLM_CPU_KVCACHE_SPACE: int | None = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None - VLLM_CPU_MOE_PREPACK: bool = True VLLM_CPU_SGL_KERNEL: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False @@ -665,10 +664,6 @@ def get_vllm_port() -> int | None: ) if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None, - # (CPU backend only) whether to use prepack for MoE layer. This will be - # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might - # need to set this to "0" (False). - "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), # (CPU backend only) whether to use SGL kernels, optimized for small batch. 
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), # If the env var is set, Ray Compiled Graph uses the specified diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 23ace3408562..572307052b48 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -6,7 +6,6 @@ from torch.nn import functional as F from vllm import _custom_ops as ops -from vllm import envs def silu_and_mul(x: torch.Tensor) -> torch.Tensor: @@ -130,54 +129,6 @@ def select_experts( ) -class IPEXFusedMOE: - def __init__(self, layer: torch.nn.Module) -> None: - import intel_extension_for_pytorch as ipex - - layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( - layer.w13_weight, - layer.w2_weight, - use_prepack=envs.VLLM_CPU_MOE_PREPACK, - ) - - def __call__( - self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - ) -> torch.Tensor: - assert activation == "silu", f"{activation} is not supported." - assert not apply_router_weight_on_input - assert routed_scaling_factor == 1.0, ( - f"routed_scaling_factor {routed_scaling_factor} is not supported." - ) - return layer.ipex_fusion( - x, - use_grouped_topk, - top_k, - router_logits, - renormalize, - topk_group, - num_expert_group, - custom_routing_function, - scoring_func, - e_score_correction_bias, - ) - - class SGLFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: pass diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index ce56887f1c26..2e0376553b91 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -260,7 +260,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight.copy_(packed_w2_weight) layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer) else: - layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer) + layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) else: layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index bb42b10f8718..18aaae394f93 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -38,6 +38,8 @@ "inc", "mxfp4", "petit_nvfp4", + "cpu_gptq", + "cpu_awq", ] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) @@ -107,6 +109,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .compressed_tensors.compressed_tensors import ( CompressedTensorsConfig, ) + from .cpu_wna16 import CPUAWQConfig, CPUGPTQConfig from .deepspeedfp import DeepSpeedFPConfig from .experts_int8 import ExpertsInt8Config from .fbgemm_fp8 import FBGEMMFp8Config @@ -159,6 +162,8 @@ def get_quantization_config(quantization: str) -> 
type[QuantizationConfig]: "inc": INCConfig, "mxfp4": Mxfp4Config, "petit_nvfp4": PetitNvFp4Config, + "cpu_gptq": CPUGPTQConfig, + "cpu_awq": CPUAWQConfig, } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/cpu_wna16.py b/vllm/model_executor/layers/quantization/cpu_wna16.py new file mode 100644 index 000000000000..bf643f55f1b9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/cpu_wna16.py @@ -0,0 +1,625 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Optional + +import torch +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE + +from vllm._custom_ops import ( + cpu_gemm_wna16, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + marlin_repeat_scales_on_all_ranks, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped, + pack_cols, + unpack_cols, +) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.utils import WeightsMapper +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + RowvLLMParameter, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_safetensors_params_metadata +from vllm.utils.collection_utils import is_list_of + +logger = init_logger(__name__) + + +class CPUGPTQConfig(QuantizationConfig): + """Config class for CPU GPTQ quant""" + + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic: dict[str, dict[str, int | bool]], + full_config: dict[str, Any], + modules_in_block_to_quantize: list[str] | None = None, + ) -> None: + super().__init__() + if desc_act and group_size == -1: + # In this case, act_order == True is the same as act_order == False + # (since we have only one group per output channel) + desc_act = False + + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is dict[str, dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. 
More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + assert weight_bits == 4 + self.dynamic = dynamic + self.weight_bits = weight_bits + self.is_sym = is_sym + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized + self.full_config = full_config + self.modules_in_block_to_quantize = modules_in_block_to_quantize or [] + + def __repr__(self) -> str: + return ( + f"CPUWNA16Config(" + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"dynamic={self.dynamic}, " + f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "cpu_gptq" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return -1 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "CPUGPTQConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + group_size = cls.get_from_keys(config, ["group_size"]) + is_sym = cls.get_from_keys(config, ["sym"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_in_block_to_quantize = cls.get_from_keys_or( + config, ["modules_in_block_to_quantize"], default=None + ) + return cls( + weight_bits, + group_size, + desc_act, + is_sym, + lm_head_quantized, + dynamic, + config, + modules_in_block_to_quantize, + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + quant_method = hf_quant_cfg.get("quant_method", "").lower() + if current_platform.is_cpu() and (quant_method == "gptq"): + return cls.get_name() + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + return get_linear_quant_method(self, layer, prefix, CPUGPTQLinearMethod) # type: ignore + + def apply_vllm_mapper(self, hf_to_vllm_mapper): + if self.modules_in_block_to_quantize is not None: + self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list( + self.modules_in_block_to_quantize + ) + + def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_in_block_to_quantize: + if is_list_of(self.modules_in_block_to_quantize, list): + # original modules_in_block_to_quantize: list[list[str]] + # flatten original modules_in_block_to_quantize + self.modules_in_block_to_quantize = [ + item + for sublist in self.modules_in_block_to_quantize + for item in sublist + ] + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + quant_layers: 
set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_in_block_to_quantize = list(quant_layers) + + +class CPUGPTQLinearMethod(LinearMethodBase): + """Linear method for GPTQ on CPU. + + Args: + quant_config: The CPUWNA16 quantization config. + """ + + def __init__(self, quant_config: CPUGPTQConfig) -> None: + self.quant_config = quant_config + assert self.quant_config.is_sym, "GPTQ asym quant is not supported on CPU" + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + output_size_per_partition = sum(output_partition_sizes) + assert output_size_per_partition * self.quant_config.weight_bits % 32 == 0 + assert output_size_per_partition % 32 == 0 + assert input_size_per_partition % 32 == 0 + + is_row_parallel = input_size != input_size_per_partition + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + # Determine sharding + if marlin_repeat_scales_on_all_ranks( + self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel + ): + # By setting scale_dim == None, weight_loader will + # repeat the scales on each rank in TP>1 case. + scales_and_zp_input_dim = None + scales_and_zp_size = input_size // group_size + else: + # By setting scale_dim == 0, weight_loader will + # shard the scales in TP>1 case. + scales_and_zp_input_dim = 0 + scales_and_zp_size = input_size_per_partition // group_size + + # Quantized weights + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + # Activation order + g_idx = RowvLLMParameter( + data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader, + ) + set_weight_attrs( + g_idx, + {"ignore_warning": True}, + ) + + qzeros_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + "weight_loader": weight_loader, + } + weight_scale_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition, + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + + if scales_and_zp_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + else: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("scales", scales) + layer.register_parameter("qzeros", qzeros) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + torch.set_printoptions(profile="full", linewidth=5000, sci_mode=False) + 
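An editorial note on the repacking performed just below (the sketch is not part of the diff): a GPTQ checkpoint stores eight 4-bit values per `int32` along the input dimension, whereas the new `cpu_gemm_wna16` kernel expects the weight as blocks of 16 output channels, each block contiguous along K, i.e. shape `[N / 16, K * 16 / pack_factor]`, consumed together with scales of shape `[K / group_size, N]`. The toy code below mirrors only the final `view`/`permute`/`reshape` bookkeeping on an already column-packed `[K, N / pack_factor]` tensor; the tensor contents and sizes are placeholders.

```python
# Minimal layout sketch, assuming a column-packed [K, N/8] int32 weight
# (what pack_cols produces); only the shape bookkeeping mirrors the code below.
import torch

K, N, pack_factor = 64, 32, 8  # 4-bit weights: 8 nibbles per int32
packed = torch.zeros(K, N // pack_factor, dtype=torch.int32)

blocked = (
    packed.view(K, -1, 16 // pack_factor)  # two int32 columns == 16 output channels
    .permute(1, 0, 2)                      # bring the channel-block dim to the front
    .reshape(-1, K * 16 // pack_factor)    # one contiguous row per 16-channel block
    .contiguous()
)
assert blocked.shape == (N // 16, K * 16 // pack_factor)
```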
packed_weight = layer.qweight.data + bits = self.quant_config.weight_bits + pack_factor = int(self.quant_config.pack_factor) + p_w_k, p_w_n = packed_weight.size() + input_size = p_w_k * pack_factor + output_size = p_w_n + isa_hint = _get_isa_hint(layer.scales.dtype) + layer.isa_hint = isa_hint + + layer.qzeros = None + if not self.quant_config.desc_act: + layer.g_idx = None + + # convert input dim packed to output dim packed + weight = unpack_cols(packed_weight, bits, p_w_k, p_w_n * pack_factor).view( + p_w_k, p_w_n, pack_factor + ) + weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous() + weight = pack_cols(weight, bits, input_size, output_size) + # make 16 output channel as a block and transpose to the make + # the block contigous + weight = ( + weight.view(input_size, -1, 16 // pack_factor) + .permute(1, 0, 2) + .reshape(-1, input_size * 16 // pack_factor) + .contiguous() + ) + layer.qweight.data = weight + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + x = cpu_gemm_wna16( + input=x, + q_weight=layer.qweight, + scales=layer.scales, + zeros=layer.qzeros, + g_idx=layer.g_idx, + bias=bias, + pack_factor=8, + isa_hint=layer.isa_hint, + ) + return x + + +class CPUAWQConfig(QuantizationConfig): + """Config class for CPU AWQ""" + + def __init__( + self, + weight_bits: int, + group_size: int, + zero_point: bool, + lm_head_quantized: bool, + modules_to_not_convert: list[str] | None, + full_config: dict[str, Any], + ) -> None: + super().__init__() + assert weight_bits == 4 + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.zero_point = zero_point + self.lm_head_quantized = lm_head_quantized + self.weight_bits = weight_bits + self.modules_to_not_convert = modules_to_not_convert or [] + self.full_config = full_config + + def __repr__(self) -> str: + return ( + f"AWQMarlinConfig(" + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"modules_to_not_convert={self.modules_to_not_convert})" + ) + + @classmethod + def get_name(cls) -> "QuantizationMethods": + return "cpu_awq" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return -1 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "CPUAWQConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + zero_point = cls.get_from_keys(config, ["zero_point"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None + ) + return cls( + weight_bits, + group_size, + zero_point, + lm_head_quantized, + modules_to_not_convert, + config, + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> Optional["QuantizationMethods"]: + quant_method = hf_quant_cfg.get("quant_method", "").lower() + if current_platform.is_cpu() and (quant_method == "awq"): + return cls.get_name() + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): + if 
is_layer_skipped( + prefix, + self.modules_to_not_convert, + self.packed_modules_mapping, + skip_with_substr=True, + ): + return UnquantizedLinearMethod() + return CPUAWQLinearMethod(self) + return None + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.modules_to_not_convert: + self.modules_to_not_convert = hf_to_vllm_mapper.apply_list( + self.modules_to_not_convert + ) + + def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_to_not_convert: + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + layers = {param_name.rsplit(".", 1)[0] for param_name in metadata} + quant_layers: set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_to_not_convert = list(layers - quant_layers) + + +class CPUAWQLinearMethod(LinearMethodBase): + """Linear method for CPU AWQ. + + Args: + quant_config: The CPU AWQ quantization config. + """ + + def __init__(self, quant_config: CPUAWQConfig) -> None: + self.quant_config = quant_config + assert self.quant_config.zero_point + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + del output_size + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + num_groups = input_size_per_partition // group_size + + qzeros = PackedvLLMParameter( + data=torch.empty( + num_groups, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + scales = GroupQuantScaleParameter( + data=torch.empty( + num_groups, + output_size_per_partition, + dtype=params_dtype, + ), + input_dim=0, + output_dim=1, + weight_loader=weight_loader, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("qzeros", qzeros) + layer.register_parameter("scales", scales) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + torch.set_printoptions(profile="full", linewidth=5000, sci_mode=False) + packed_weight = layer.qweight.data + packed_zeros = layer.qzeros.data + group_num = packed_zeros.size(0) + bits = self.quant_config.weight_bits + pack_factor = int(self.quant_config.pack_factor) + input_size, packed_output_size = packed_weight.size() + output_size = packed_output_size * pack_factor + isa_hint = _get_isa_hint(layer.scales.dtype) + layer.isa_hint = isa_hint + + interleave_map = (0, 4, 1, 5, 2, 6, 3, 7) + weight = unpack_cols( + packed_weight, + bits, + input_size, + output_size, + ) + zeros = unpack_cols( + packed_zeros, + bits, + group_num, + output_size, + ) + weight = ( + weight.view(input_size, -1, 
pack_factor)[:, :, interleave_map] + .reshape(input_size, output_size) + .contiguous() + ) + zeros = ( + zeros.view(group_num, -1, pack_factor)[:, :, interleave_map] + .reshape(group_num, output_size) + .contiguous() + ) + + zeros = pack_cols(zeros, bits, group_num, output_size).contiguous() + # make 16 output channel as a block and transpose to + # the make the block contigous + weight = pack_cols(weight, bits, input_size, output_size) + weight = ( + weight.view(input_size, -1, 16 // pack_factor) + .permute(1, 0, 2) + .reshape(-1, input_size * 16 // pack_factor) + .contiguous() + ) + layer.qweight.data = weight + layer.qzeros.data = zeros + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + x = cpu_gemm_wna16( + input=x, + q_weight=layer.qweight, + scales=layer.scales, + zeros=layer.qzeros, + g_idx=None, + bias=bias, + pack_factor=8, + isa_hint=layer.isa_hint, + ) + return x + + +def _get_isa_hint(dtype: torch.dtype) -> str: + supports_amx = torch._C._cpu._is_amx_tile_supported() + if supports_amx and dtype in (torch.bfloat16,): + return "amx" + else: + return "vec" diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 5ca9167faec8..22c4bae041a5 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -134,7 +134,7 @@ def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": def override_quantization_method( cls, hf_quant_cfg, user_quant ) -> QuantizationMethods | None: - if not current_platform.is_cpu() and not current_platform.is_xpu(): + if not current_platform.is_xpu(): return None quant_method = hf_quant_cfg.get("quant_method", "").lower() From 814843e021a3618f7f8e494d4f0d4fd561cf3225 Mon Sep 17 00:00:00 2001 From: Strahinja Stamenkovic Date: Wed, 19 Nov 2025 04:12:31 +0100 Subject: [PATCH 171/578] Enable bitsandbytes quantization on AMD GPUs that use warp size 32 (#27307) Signed-off-by: sstamenk --- tests/models/quantization/test_bitsandbytes.py | 11 +++++++---- vllm/platforms/rocm.py | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index 24220978534c..dc4b4546e451 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -14,10 +14,13 @@ from ...utils import compare_two_settings, multi_gpu_test from ..utils import check_embeddings_close, check_logprobs_close -pytestmark = pytest.mark.skipif( - current_platform.is_rocm(), - reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)", -) +if current_platform.is_rocm(): + from vllm.platforms.rocm import on_gfx9 + + pytestmark = pytest.mark.skipif( + on_gfx9(), + reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)", + ) models_4bit_to_test = [ ("facebook/opt-125m", "quantize opt model inflight"), diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 788f9d69c357..bb116792fed5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -185,6 +185,9 @@ class RocmPlatform(Platform): "petit_nvfp4", "torchao", ] + # bitsandbytes not supported on gfx9 (warp size 64 limitation) + if not on_gfx9(): + supported_quantization += ["bitsandbytes"] @classmethod def get_vit_attn_backend( From 4c23690f43e51eccf6ce5866ac47adcf39215e4d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 18 Nov 2025 
23:06:21 -0500 Subject: [PATCH 172/578] [Attention] FlashAttention ViT support, make default backend (#28763) Signed-off-by: Matthew Bonanni --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- tests/kernels/attention/test_flash_attn.py | 4 +-- tests/kernels/attention/test_mha_attn.py | 30 +------------------ vllm/platforms/cuda.py | 21 ++++++------- vllm/v1/attention/backends/flash_attn.py | 4 +-- 5 files changed, 15 insertions(+), 46 deletions(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 567c8959f045..6cc5cda14c52 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 + GIT_TAG 71bb26f6295449be880344b93b51791cc009237d GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 6e5468969bf2..26b8c77ab482 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -13,14 +13,14 @@ ) NUM_HEADS = [(4, 4), (8, 2)] -HEAD_SIZES = [128, 256] +HEAD_SIZES = [40, 72, 80, 128, 256] BLOCK_SIZES = [16] DTYPES = [torch.bfloat16] QDTYPES = [None, torch.float8_e4m3fn] # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] -SOFT_CAPS = [None, 50.0] +SOFT_CAPS = [None] SLIDING_WINDOWS = [None, 256] diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 183bbf3bf4e0..a878ac6396ce 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -62,38 +62,10 @@ def test_mha_attn_platform(device: str): assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN # Test CUDA with head_size=72 (not divisible by 32) - # - with upstream FA not available - # - should use xformers - with ( - patch("vllm.attention.layer.current_platform", CudaPlatform()), - patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), - patch( - "vllm.attention.layer.check_upstream_fa_availability", - return_value=False, - ), - ): - attn = MultiHeadAttention(16, 72, scale=1) - assert attn.attn_backend == AttentionBackendEnum.XFORMERS - - # Test CUDA with head_size=72 (not divisible by 32) - # - with upstream FA available - # - should use upstream FA + # - should use vLLM's FlashAttention with ( patch("vllm.attention.layer.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), - patch( - "vllm.attention.layer.check_upstream_fa_availability", return_value=True - ), - patch.dict( - "sys.modules", - { - "flash_attn": type( - "MockFlashAttn", - (), - {"flash_attn_varlen_func": lambda *args, **kwargs: None}, - )() - }, - ), ): attn = MultiHeadAttention(16, 72, scale=1) assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2e4dd8bb808b..f9bf242b7194 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -267,24 +267,21 @@ def get_vit_attn_backend( ) -> "AttentionBackendEnum": from vllm.attention.backends.registry import AttentionBackendEnum - # For Blackwell GPUs, force TORCH_SDPA for now. 
- # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 - if cls.has_device_capability(100): - return AttentionBackendEnum.TORCH_SDPA - - if dtype not in (torch.float16, torch.bfloat16): - return AttentionBackendEnum.XFORMERS - - if cls.has_device_capability(80): + # Try FlashAttention first + try: backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() if backend_class.supports_head_size( head_size ) and backend_class.supports_dtype(dtype): return AttentionBackendEnum.FLASH_ATTN - else: - return AttentionBackendEnum.XFORMERS + except ImportError: + pass + + if cls.has_device_capability(100): + # xFormers doesn't support Blackwell, fall back to SDPA + # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 + return AttentionBackendEnum.TORCH_SDPA else: - # Fallback for Volta/Turing GPUs or FA not supported return AttentionBackendEnum.XFORMERS @classmethod diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a5d4435000d4..fdc99a0df1c8 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -119,8 +119,8 @@ def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") @classmethod - def get_supported_head_sizes(cls) -> list[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] + def supports_head_size(cls, head_size: int) -> bool: + return head_size % 8 == 0 and head_size <= 256 @classmethod def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool: From 468a8d72bac181c1499320478940cec64363e107 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 18 Nov 2025 21:05:22 -0800 Subject: [PATCH 173/578] [Bugfix] Fix FusedMoEModularKernel for triton backend (#28913) Signed-off-by: Xin Yang --- vllm/model_executor/layers/quantization/mxfp4.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index b95d1a6b3a1f..66ae2e94c60a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,10 @@ def _interleave_mxfp4_cutlass_sm90(w): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False) - layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False) + del layer.w13_weight + del layer.w2_weight + layer.w13_weight = w13_weight + layer.w2_weight = w2_weight else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") @@ -1065,8 +1067,8 @@ def apply( return triton_kernel_moe_forward( hidden_states=x, - w1=self.w13_weight, - w2=self.w2_weight, + w1=layer.w13_weight, + w2=layer.w2_weight, gating_output=router_logits, topk=top_k, renormalize=renormalize, From 73ff872db0d4e3f5e133d5d2a5307248619d93a6 Mon Sep 17 00:00:00 2001 From: Gleb Kurchanov Date: Wed, 19 Nov 2025 08:21:02 +0300 Subject: [PATCH 174/578] [Bugfix] Fix typo in Qwen3 Next model executor (#28960) Signed-off-by: Gleb Kurchanov --- vllm/model_executor/models/qwen3_next.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 86508a7c6431..0415c8e00fdf 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ 
b/vllm/model_executor/models/qwen3_next.py @@ -1154,8 +1154,8 @@ def set_moe_parameters(self): example_moe = layer.mlp self.moe_layers.append(layer.mlp.experts) - if example_moe is None: - raise RuntimeError("No Qwen3Next layer found in the model.layers.") + if example_moe is None: + raise RuntimeError("No Qwen3Next layer found in the model.layers.") # Set MoE hyperparameters self.num_moe_layers = len(self.moe_layers) From 6a25ea5f0ea193e35b5a83cb0285c48964bc9eb1 Mon Sep 17 00:00:00 2001 From: Uranus <109661872+UranusSeven@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:30:08 +0800 Subject: [PATCH 175/578] [Docs] Update oneshot imports (#28188) Signed-off-by: UranusSeven <109661872+UranusSeven@users.noreply.github.com> --- docs/features/quantization/fp8.md | 2 +- docs/features/quantization/int4.md | 2 +- docs/features/quantization/int8.md | 2 +- docs/features/quantization/quantized_kvcache.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 0c5111fb8af0..d4a6176b236f 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -60,7 +60,7 @@ Since simple RTN does not require data for weight quantization and the activatio ??? code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier # Configure the simple PTQ quantization diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 035e7ea291f9..9752039097d6 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -80,7 +80,7 @@ Now, apply the quantization algorithms: ??? code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index ec8a77f74ffe..701ca6378cb1 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -87,7 +87,7 @@ Now, apply the quantization algorithms: ??? 
code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index 56cf057678be..d26a5e217f31 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -78,7 +78,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models ```python from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot # Select model and load it MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" From 3d4e7d34be856cc4f54033e6a019059afacb5e76 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 19 Nov 2025 05:43:01 +0000 Subject: [PATCH 176/578] [Model][QwenVL] Simplify cos/sin rotary embedding indexing (#28962) Signed-off-by: Lukas Geiger --- vllm/model_executor/models/glm4_1v.py | 9 ++------- vllm/model_executor/models/qwen2_5_vl.py | 9 ++------- vllm/model_executor/models/qwen2_vl.py | 9 ++------- .../models/qwen3_omni_moe_thinker.py | 9 ++------- vllm/model_executor/models/qwen3_vl.py | 17 +++-------------- 5 files changed, 11 insertions(+), 42 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 2c2f45c2453e..7a4fee76ae6b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -797,13 +797,8 @@ def rot_pos_emb( # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined, pos_ids def compute_attn_mask_seqlen( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2e4fd9645d88..5b5d50ec8935 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -738,13 +738,8 @@ def rotary_pos_emb_thw(self, t, h, w): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) cos_combined = cos_combined.reshape( cos_combined.shape[0] // self.spatial_merge_unit, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 53df5972a8fe..cda8eaf5377f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -724,13 +724,8 @@ def rot_pos_emb( # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - 
cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined def compute_attn_mask_seqlen( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 8274b92138f7..d2fd74a5e41a 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -428,13 +428,8 @@ def rot_pos_emb(self, grid_thw): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 99a4007ef7f2..0c546309400b 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -459,18 +459,13 @@ def rot_pos_emb(self, grid_thw: list[list[int]]): else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1) for t, h, w in grid_thw ] - pos_ids = torch.cat(pos_ids, dim=0) + pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True) # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined @@ -566,12 +561,6 @@ def forward( pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb_cos = rotary_pos_emb_cos.to( - hidden_states.device, non_blocking=True - ) - rotary_pos_emb_sin = rotary_pos_emb_sin.to( - hidden_states.device, non_blocking=True - ) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] From 71d0ae1c54543689ea7541aa20b9522982b0815e Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 19 Nov 2025 09:28:40 +0300 Subject: [PATCH 177/578] [Misc] Update embedding/cross encoder tests to use `mteb` v2 (#27329) Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: Cyrus Leung Co-authored-by: Isotr0py Co-authored-by: wang.yuqi Co-authored-by: wang.yuqi --- requirements/test.in | 2 +- requirements/test.txt | 4 +- .../language/pooling_mteb_test/mteb_utils.py | 179 +++++++++++------- .../test_bge_reranker_v2_gemma.py | 31 ++- .../pooling_mteb_test/test_mxbai_rerank.py | 5 +- .../pooling_mteb_test/test_qwen3_reranker.py | 5 +- 6 files changed, 143 insertions(+), 83 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 30d97e9b9c7d..05f6bcca5c2c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # 
required for minicpm3 test # TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test -mteb[bm25s]>=1.38.11, <2 # required for mteb test +mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.1 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index 3263b74c0879..bcd511660f85 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -201,8 +201,6 @@ email-validator==2.2.0 # via pydantic encodec==0.1.1 # via vocos -eval-type-backport==0.2.2 - # via mteb evaluate==0.4.3 # via lm-eval fastapi==0.116.1 @@ -490,7 +488,7 @@ msgpack==1.1.0 # via # librosa # ray -mteb==1.38.11 +mteb==2.1.2 # via -r requirements/test.in multidict==6.1.0 # via diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py index 0384ff82790f..189cdbae99dc 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile -from collections.abc import Sequence import mteb import numpy as np import requests import torch +from mteb.models import ModelMeta +from mteb.types import Array +from torch.utils.data import DataLoader import tests.ci_envs as ci_envs from tests.models.utils import ( @@ -27,24 +29,47 @@ # See #19344 MTEB_RERANK_TASKS = ["NFCorpus"] -MTEB_RERANK_LANGS = ["en"] +MTEB_RERANK_LANGS = ["eng"] MTEB_RERANK_TOL = 2e-3 +_empty_model_meta = ModelMeta( + loader=None, + name="vllm/model", + revision="1", + release_date=None, + languages=None, + framework=[], + similarity_fn_name=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=None, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, + modalities=["text"], # 'image' can be added to evaluate multimodal models +) + + +class VllmMtebEncoder(mteb.EncoderProtocol): + mteb_model_meta = _empty_model_meta -class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): - super().__init__() self.llm = vllm_model self.rng = np.random.default_rng(seed=42) def encode( self, - sentences: Sequence[str], + inputs: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: # Hoping to discover potential scheduling # issues by randomizing the order. 
+ sentences = [text for batch in inputs for text in batch["text"]] r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] outputs = self.llm.embed(sentences, use_tqdm=False) @@ -52,36 +77,70 @@ def encode( embeds = embeds[np.argsort(r)] return embeds + def similarity( + self, + embeddings1: np.ndarray, + embeddings2: np.ndarray, + ) -> np.ndarray: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T) + return sim + + def similarity_pairwise( + self, + embeddings1: Array, + embeddings2: Array, + ) -> Array: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.sum(embeddings1 * embeddings2, axis=1) / ( + norm1.flatten() * norm2.flatten() + ) + return sim + + +class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + + def __init__(self, vllm_model): + self.llm = vllm_model + self.rng = np.random.default_rng(seed=42) + def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] - - queries = [s[0] for s in sentences] - corpus = [s[1] for s in sentences] + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] outputs = self.llm.score( queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False ) scores = np.array(outputs) - scores = scores[np.argsort(r)] return scores -class OpenAIClientMtebEncoder(mteb.Encoder): +class OpenAIClientMtebEncoder(VllmMtebEncoder): def __init__(self, model_name: str, client): - super().__init__() self.model_name = model_name self.client = client self.rng = np.random.default_rng(seed=42) - def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: + def encode( + self, + inputs: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: # Hoping to discover potential scheduling # issues by randomizing the order. 
+ sentences = [text for batch in inputs for text in batch["text"]] r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] @@ -94,28 +153,29 @@ def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: return embeds -class ScoreClientMtebEncoder(mteb.Encoder): +class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + def __init__(self, model_name: str, url): - super().__init__() self.model_name = model_name self.url = url self.rng = np.random.default_rng(seed=42) def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] + queries = [text for batch in inputs1 for text in batch["text"]] + full_corpus = [text for batch in inputs2 for text in batch["text"]] outputs = [] - for query, corpus, prompt in sentences: + for query, corpus in zip(queries, full_corpus): outputs.append(self.get_score(query, corpus)) scores = np.array(outputs) - scores = scores[np.argsort(r)] return scores def get_score(self, query, corpus): @@ -145,16 +205,13 @@ def get_score(self, query, corpus): return response["results"][0]["relevance_score"] -def run_mteb_embed_task(encoder, tasks): +def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): tasks = mteb.get_tasks(tasks=tasks) - evaluation = mteb.MTEB(tasks=tasks) - results = evaluation.run( + results = mteb.evaluate( encoder, - verbosity=0, - output_folder=None, - encode_kwargs={ - "show_progress_bar": False, - }, + tasks, + cache=None, + show_progress_bar=False, ) main_score = results[0].scores["test"][0]["main_score"] @@ -244,33 +301,39 @@ def mteb_test_embed_models( assert st_main_score - vllm_main_score < atol -def run_mteb_rerank(cross_encoder, tasks, languages): - with tempfile.TemporaryDirectory() as results_folder: +def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): + with tempfile.TemporaryDirectory() as prediction_folder: bm25s = mteb.get_model("bm25s") - tasks = mteb.get_tasks(tasks=tasks, languages=languages) - - subset = "default" eval_splits = ["test"] - evaluation = mteb.MTEB(tasks=tasks) - evaluation.run( + mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks( + tasks=tasks, languages=languages, eval_splits=eval_splits + ) + + mteb.evaluate( bm25s, - verbosity=0, - eval_splits=eval_splits, - save_predictions=True, - output_folder=f"{results_folder}/stage1", - encode_kwargs={"show_progress_bar": False}, + mteb_tasks, + prediction_folder=prediction_folder, + show_progress_bar=False, + # don't save results for test runs + cache=None, + overwrite_strategy="always", ) - results = evaluation.run( + second_stage_tasks = [] + for task in mteb_tasks: + second_stage_tasks.append( + task.convert_to_reranking( + prediction_folder, + top_k=10, + ) + ) + + results = mteb.evaluate( cross_encoder, - verbosity=0, - eval_splits=eval_splits, - top_k=10, - save_predictions=True, - output_folder=f"{results_folder}/stage2", - previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json", - encode_kwargs={"show_progress_bar": False}, + second_stage_tasks, + show_progress_bar=False, + cache=None, ) main_score = results[0].scores["test"][0]["main_score"] return main_score @@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf( hf_runner, model_name, hf_dtype="float32", hf_model_callback=None ): 
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: - original_predict = hf_model.predict - - def _predict( - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt - *args, - **kwargs, - ): - # vllm and st both remove the prompt, fair comparison. - prompts = [(s[0], s[1]) for s in sentences] - return original_predict(prompts, *args, **kwargs, batch_size=8) - - hf_model.predict = _predict - hf_model.original_predict = original_predict - if hf_model_callback is not None: hf_model_callback(hf_model) @@ -310,7 +359,7 @@ def mteb_test_rerank_models( model_info: RerankModelInfo, vllm_extra_kwargs=None, hf_model_callback=None, - vllm_mteb_encoder=VllmMtebEncoder, + vllm_mteb_encoder=VllmMtebCrossEncoder, atol=MTEB_RERANK_TOL, ): vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 2927a3711136..6b2e46964492 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -2,13 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any +import mteb import numpy as np import pytest import torch +from torch.utils.data import DataLoader from tests.conftest import HfRunner from tests.models.language.pooling_mteb_test.mteb_utils import ( - VllmMtebEncoder, + VllmMtebCrossEncoder, mteb_test_rerank_models, ) from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo @@ -103,7 +105,7 @@ def get_inputs(pairs, tokenizer, prompt=None): return torch.Tensor(scores) -class GemmaMtebEncoder(VllmMtebEncoder): +class GemmaMtebEncoder(VllmMtebCrossEncoder): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.query_template = "A: {query}\n" @@ -111,17 +113,26 @@ def __init__(self, *args, **kwargs): def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - _sentences = [] - for query, corpus, prompt in sentences: - query = self.query_template.format(query=query) - corpus = self.document_template.format(doc=corpus, prompt=PROMPT) - _sentences.append((query, corpus, prompt)) - - return super().predict(_sentences, *args, **kwargs) + queries = [ + self.query_template.format(query=text) + for batch in inputs1 + for text in batch["text"] + ] + corpus = [ + self.document_template.format(doc=text, prompt=PROMPT) + for batch in inputs2 + for text in batch["text"] + ] + outputs = self.llm.score( + queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False + ) + scores = np.array(outputs) + return scores @pytest.mark.parametrize("model_info", RERANK_MODELS) diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index fd04dc199023..a6f2a89b268f 100644 --- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -70,8 +70,9 @@ def compute_logits(inputs): return scores scores = [] - for prompt in prompts: - inputs = process_inputs([prompt]) + for query, doc, *_ in prompts: + pairs = [(query, doc)] + inputs = process_inputs(pairs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) 
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index 00e99f44cfdb..9a1be6c0be1d 100644 --- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -72,8 +72,9 @@ def compute_logits(inputs): return scores scores = [] - for prompt in prompts: - inputs = process_inputs([prompt]) + for query, doc, *_ in prompts: + pairs = [(query, doc)] + inputs = process_inputs(pairs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) From a4511e38db375a85b4dd784c2c38528747288f46 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Nov 2025 01:46:32 -0500 Subject: [PATCH 178/578] Speed up macOS smoke test (#28954) Signed-off-by: Michael Goin Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 42b05ecd5ac0..a183033c9add 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -9,7 +9,7 @@ on: jobs: macos-m1-smoke-test: runs-on: macos-latest - timeout-minutes: 20 + timeout-minutes: 30 steps: - uses: actions/checkout@v4 @@ -37,15 +37,14 @@ jobs: - name: Verify installation run: | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" - python -c "import torch; print(f'PyTorch: {torch.__version__}')" - name: Smoke test vllm serve - timeout-minutes: 10 run: | # Start server in background vllm serve Qwen/Qwen3-0.6B \ - --max-model-len=2048 \ + --max-model-len=2K \ --load-format=dummy \ + --hf-overrides '{"num_hidden_layers": 2}' \ --enforce-eager \ --port 8000 & From 7ed27f3cb55e3f64614300ec7acde1b382a48541 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 19 Nov 2025 07:52:30 +0100 Subject: [PATCH 179/578] [Doc]: fix typos in various files (#28945) Signed-off-by: Didier Durand --- docs/design/moe_kernel_features.md | 4 ++-- docs/design/plugin_system.md | 2 +- docs/features/quantization/quark.md | 2 +- examples/online_serving/prometheus_grafana/README.md | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/envs.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 7663b82266f0..36ae9506b65f 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -4,7 +4,7 @@ The purpose of this document is to provide an overview of the various MoE kernel ## Fused MoE Modular All2All backends -There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend. +There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` subclasses provide an interface for each all2all backend. The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. @@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. ## Fused MoE Experts Kernels -The are a number of MoE experts kernel implementations for different quantization types and architectures. 
Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index dc2f7c4aed3c..e8db8047ca4e 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -49,7 +49,7 @@ Every plugin has three parts: - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. -- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. +- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre-/post-processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. - **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase. diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index bd7bc186e13a..c54d7d225199 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -306,7 +306,7 @@ As examples, we provide some ready-to-use quantized mixed precision model to sho ### 2. inference the quantized mixed precision model in vLLM -Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follow: +Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follows: ```bash lm_eval --model vllm \ diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index 5cd4dab5a8fa..9615210a2ad8 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -46,7 +46,7 @@ Navigate to [`http://localhost:3000`](http://localhost:3000). 
Log in with the de Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. -On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. +On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each container. You can just use `http://prometheus:9090`. Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ab6e5e594c23..e2f7326448b3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1500,7 +1500,7 @@ def create_engine_config( # Local DP rank = 1, use pure-external LB. if data_parallel_external_lb: assert self.data_parallel_rank is not None, ( - "data_parallel_rank or node_rank must be spefified if " + "data_parallel_rank or node_rank must be specified if " "data_parallel_external_lb is enable." ) assert self.data_parallel_size_local in (1, None), ( diff --git a/vllm/envs.py b/vllm/envs.py index 6d92d5afee50..e61fb114325c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1261,7 +1261,7 @@ def get_vllm_port() -> int | None: # MoE routing strategy selector. # See `RoutingSimulator.get_available_strategies()` # for available # strategies. - # Cutstom routing strategies can be registered by + # Custom routing strategies can be registered by # RoutingSimulator.register_strategy() # Note: custom strategies may not produce correct model outputs "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: os.environ.get( From ae4821a1086325decbc801d3292dee42e42549bb Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 18 Nov 2025 23:47:57 -0800 Subject: [PATCH 180/578] Add CPU support model (#28697) Signed-off-by: Tsai, Louie --- docs/models/hardware_supported_models/cpu.md | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 docs/models/hardware_supported_models/cpu.md diff --git a/docs/models/hardware_supported_models/cpu.md b/docs/models/hardware_supported_models/cpu.md new file mode 100644 index 000000000000..0832755f8fbe --- /dev/null +++ b/docs/models/hardware_supported_models/cpu.md @@ -0,0 +1,26 @@ +# CPU - Intel® Xeon® + +## Supported Models + +### Text-only Language Models + +| Model | Architecture | Supported | +|--------------------------------------|-------------------------------------------|-----------| +| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ | +| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ | +| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ | +| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ | +| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ | +| zai-org/GLM-4.5 | GLMForCausalLM | ✅ | +| google/gemma | GemmaForCausalLM | ✅ | + +### Multimodal Language Models + +| Model | Architecture | Supported | +|--------------------------------------|-------------------------------------------|-----------| +| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ | +| openai/whisper | WhisperForConditionalGeneration | ✅ | + +✅ Runs and optimized. +🟨 Runs and correct but not optimized to green yet. +❌ Does not pass accuracy test or does not run. 
From d69062c67af46a2e624be92162e9db585eef329b Mon Sep 17 00:00:00 2001 From: gnovack Date: Wed, 19 Nov 2025 00:32:00 -0800 Subject: [PATCH 181/578] add support for --fully-sharded-loras in fused_moe (#28761) Signed-off-by: gnovack Co-authored-by: Jee Jee Li --- tests/lora/test_fused_moe_lora_kernel.py | 208 +++++++++++++++++- tests/lora/test_olmoe_tp.py | 10 +- vllm/lora/layers/fused_moe.py | 36 ++- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 24 +- vllm/lora/punica_wrapper/punica_base.py | 2 + vllm/lora/punica_wrapper/punica_gpu.py | 4 + 6 files changed, 274 insertions(+), 10 deletions(-) diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 91ab4a87c65f..91c8b861c3c5 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -1,13 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import random import pytest import torch +from tests.utils import multi_gpu_test from vllm import _custom_ops as ops +from vllm.distributed import ( + init_distributed_environment, + initialize_model_parallel, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size, +) from vllm.lora.ops.triton_ops import fused_moe_lora from vllm.platforms import current_platform +from vllm.utils.network_utils import get_open_port @pytest.fixture(autouse=True) @@ -122,6 +134,8 @@ def use_fused_moe_lora_kernel( max_loras, num_experts, block_size, + fully_sharded=False, + offset=0, ): max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) @@ -195,10 +209,10 @@ def use_fused_moe_lora_kernel( config["NUM_STAGES"], config["SPLIT_K"], mul_routed_weight, + fully_sharded=fully_sharded, + offset=offset, ) - return output - def use_torch( hidden_states, @@ -317,3 +331,193 @@ def test_fused_moe_lora_kernel( ) torch.testing.assert_close(output, output2, atol=1e-1, rtol=1e-1) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("num_tokens", [100]) +@pytest.mark.parametrize("top_k_num", [6]) +@pytest.mark.parametrize("num_experts", [64]) +@pytest.mark.parametrize("max_loras", [4]) +@pytest.mark.parametrize("N", [1408]) +@pytest.mark.parametrize("K", [2048]) +@pytest.mark.parametrize("max_lora_rank", [16, 32, 64]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("column_parallel", [True, False]) +def test_fused_moe_lora_kernel_fully_sharded( + num_tokens, + top_k_num, + num_experts, + max_loras, + N, + K, + max_lora_rank, + block_size, + dtype, + seed, + column_parallel, +): + current_platform.seed_everything(seed) + # the number of randomly generated sentences. 
+ num_sequences = 10 + # generate data + topk_ids, topk_weights, token_lora_mapping = sample_data( + num_tokens, num_sequences, max_loras, num_experts, top_k_num + ) + + def run_torch_spawn(fn, nprocs): + torch.multiprocessing.spawn( + fn, + args=( + nprocs, + f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}", + dtype, + seed, + N, + K, + num_tokens, + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + max_loras, + num_experts, + block_size, + column_parallel, + ), + nprocs=nprocs, + ) + + run_torch_spawn(use_fused_moe_lora_kernel_tensor_parallel, nprocs=2) + + +def use_fused_moe_lora_kernel_tensor_parallel( + local_rank, + world_size, + init_method, + dtype, + seed, + N, + K, + num_tokens, + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + max_loras, + num_experts, + block_size, + column_parallel, +): + def _get_shard_slice(shard_size): + return slice(local_rank * shard_size, (local_rank + 1) * shard_size) + + current_platform.seed_everything(seed) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + init_distributed_environment( + world_size=world_size, + rank=local_rank, + local_rank=local_rank, + distributed_init_method=init_method, + ) + initialize_model_parallel(world_size, 1) + tp_size = get_tensor_model_parallel_world_size() + + input_dim = K if column_parallel else N + output_dim = N if column_parallel else K + + # init lora weights + lora_a = torch.rand( + ( + max_loras, + num_experts, + max_lora_rank, + input_dim, + ), + dtype=dtype, + ) + lora_b = torch.rand( + ( + max_loras, + num_experts, + output_dim, + max_lora_rank, + ), + dtype=dtype, + ) + + hidden_states = torch.rand( + ( + num_tokens, + input_dim, + ), + dtype=dtype, + ) + + output = torch.zeros((num_tokens, top_k_num, output_dim), dtype=dtype) + topk_ids = topk_ids.to(device) + topk_weights = topk_weights.to(device) + token_lora_mapping = token_lora_mapping.to(device) + + ref_output = use_torch( + hidden_states, + token_lora_mapping, + topk_ids, + [lora_a], + [lora_b], + top_k_num, + ) + + if column_parallel: + # Column parallel (e.g. gate_up_proj): LoRA A is sliced along the rank dim, + # and Lora B is sliced along the output dim + lora_a_shard_size = max_lora_rank // tp_size + lora_a = lora_a[:, :, _get_shard_slice(lora_a_shard_size), :] + max_lora_rank = lora_a_shard_size + offset = 0 + + lora_b_shard_size = output_dim // tp_size + lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :] + output = output[:, :, _get_shard_slice(lora_b_shard_size)].contiguous() + else: + # Row parallel (e.g. 
down proj): LoRA A is sliced along the input dim, + # and LoRA B is sliced along the output dim + lora_a_shard_size = input_dim // tp_size + lora_a = lora_a[:, :, :, _get_shard_slice(lora_a_shard_size)] + hidden_states = hidden_states[:, _get_shard_slice(lora_a_shard_size)] + + lora_b_shard_size = output_dim // tp_size + lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :] + offset = lora_b_shard_size * local_rank + + use_fused_moe_lora_kernel( + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + [lora_a], + [lora_b], + hidden_states, + output, + max_loras, + num_experts, + block_size, + fully_sharded=True, + offset=offset, + ) + + if column_parallel: + output = tensor_model_parallel_all_gather(output) + else: + output = tensor_model_parallel_all_reduce(output) + + torch.testing.assert_close(output, ref_output, atol=1e-1, rtol=1e-1) diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py index e659c1e1a9a0..e3c9816625ba 100644 --- a/tests/lora/test_olmoe_tp.py +++ b/tests/lora/test_olmoe_tp.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + import vllm from vllm.lora.request import LoRARequest @@ -111,8 +113,9 @@ def test_olmoe_lora_mixed(olmoe_lora_files): generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None]) +@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @multi_gpu_test(num_gpus=2) -def test_olmoe_lora_tp2(olmoe_lora_files): +def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras): llm = vllm.LLM( MODEL_PATH, max_model_len=1024, @@ -122,14 +125,16 @@ def test_olmoe_lora_tp2(olmoe_lora_files): trust_remote_code=True, enable_chunked_prefill=True, tensor_parallel_size=2, + fully_sharded_loras=fully_sharded_loras, ) generate_and_test(llm, olmoe_lora_files, lora_id=1) generate_and_test(llm, olmoe_lora_files, lora_id=2) +@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @multi_gpu_test(num_gpus=4) -def test_olmoe_lora_tp4(olmoe_lora_files): +def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras): llm = vllm.LLM( MODEL_PATH, max_model_len=1024, @@ -139,6 +144,7 @@ def test_olmoe_lora_tp4(olmoe_lora_files): trust_remote_code=True, enable_chunked_prefill=True, tensor_parallel_size=4, + fully_sharded_loras=fully_sharded_loras, ) generate_and_test(llm, olmoe_lora_files, lora_id=1) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 8fb3efa220f6..3291c41fcda1 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -12,6 +12,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.ops.triton_ops.utils import get_lora_op_configs from vllm.model_executor.layers.fused_moe import FusedMoE @@ -205,6 +206,7 @@ def wrapper(*args, **kwargs): shrink_config, ## pass the shrink config expand_config, ## pass the expand config self.adapter_enabled, + fully_sharded=self.fully_sharded, ) result = func(*args, **kwargs) @@ -250,7 +252,10 @@ def wrapper(*args, **kwargs): sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1) intermediate_cache2 = moe_state_dict["intermediate_cache2"] intermediate_cache3 = args[0] - max_lora_rank = self.w1_lora_a_stacked.shape[-2] + max_lora_rank = self.w2_lora_a_stacked.shape[-2] + + shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size) + self.punica_wrapper.add_lora_fused_moe( intermediate_cache3, 
intermediate_cache2, @@ -266,6 +271,8 @@ def wrapper(*args, **kwargs): expand_config, ## pass the expand config self.adapter_enabled, True, + fully_sharded=self.fully_sharded, + offset=shard_size_w2 * self.tp_rank if self.fully_sharded else 0, ) result = func(*args, **kwargs) @@ -294,6 +301,7 @@ def create_lora_weights( model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + self.fully_sharded = lora_config.fully_sharded_loras self.adapter_enabled = torch.tensor( [0] * (max_loras + 1), dtype=torch.int, device=self.device @@ -303,7 +311,9 @@ def create_lora_weights( ( max_loras, self.base_layer.local_num_experts, - lora_config.max_lora_rank, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), self.base_layer.hidden_size, ), dtype=lora_config.lora_dtype, @@ -334,7 +344,9 @@ def create_lora_weights( ( max_loras, self.base_layer.local_num_experts, - self.base_layer.hidden_size, + self.base_layer.hidden_size + if not self.fully_sharded + else divide(self.base_layer.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -345,7 +357,9 @@ def create_lora_weights( ( max_loras, self.base_layer.local_num_experts, - lora_config.max_lora_rank, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), self.base_layer.hidden_size, ), dtype=lora_config.lora_dtype, @@ -419,6 +433,20 @@ def set_lora( w3_lora_b = w3_lora_b[start_idx:end_idx, :] w2_lora_a = w2_lora_a[:, start_idx:end_idx] + if self.fully_sharded: + # Based on S-LoRA, we slice W1 and W3 A along the rank dim, + # and W2 B along the hidden_size dim. + w13_shard_size = self.w1_lora_a_stacked[index, eid].shape[0] + w13_start_idx = self.tp_rank * w13_shard_size + w13_end_idx = (self.tp_rank + 1) * w13_shard_size + w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] + w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :] + + w2_shard_size = self.w2_lora_b_stacked[index, eid].shape[0] + w2_start_idx = self.tp_rank * w2_shard_size + w2_end_idx = (self.tp_rank + 1) * w2_shard_size + w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] + self.w1_lora_a_stacked[ index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] ].copy_(w1_lora_a, non_blocking=True) diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index e2dd47dbb4e6..413ee8ecbbf9 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -3,6 +3,10 @@ import torch +from vllm.distributed import ( + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) from vllm.triton_utils import tl, triton from vllm.utils.torch_utils import direct_register_custom_op @@ -311,6 +315,7 @@ def _fused_moe_lora_expand( num_stages: int, split_k: int, mul_routed_weight: bool = False, + offset: int = 0, ) -> None: b_ptr = _get_ptr(lora_b_stacked, device) K = max_lora_rank @@ -380,7 +385,7 @@ def _fused_moe_lora_expand( **expand_config, ) for i in range(num_slices): - output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i] + output[:, :, i * N + offset : (i + 1) * N + offset] += b_intermediate_cache1[i] @torch.inference_mode() @@ -416,6 +421,8 @@ def _fused_moe_lora( expand_num_stages: int, expand_split_k: int, mul_routed_weight: bool = False, + fully_sharded: bool = False, + offset: int = 0, ) -> None: assert len(lora_a_stacked) == len(lora_b_stacked) > 0 assert ( @@ -430,7 +437,6 @@ def _fused_moe_lora( == 
expert_ids.shape[0] == num_tokens_post_padded.shape[0] ) - assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1] assert output.shape[0] == topk_weights.shape[0] assert top_k_num == topk_weights.shape[1] device = qcurr_hidden_states.device @@ -480,6 +486,19 @@ def _fused_moe_lora( mul_routed_weight, ) + if fully_sharded: + if max_lora_rank == w1_lora_b_stacked.shape[-1]: + a_intermediate_cache1 = tensor_model_parallel_all_reduce( + a_intermediate_cache1 + ) + else: + a_intermediate_cache1 = tensor_model_parallel_all_gather( + a_intermediate_cache1 + ) + + # reset max_lora_rank to the full rank after allgather + max_lora_rank = a_intermediate_cache1.shape[-1] + _fused_moe_lora_expand( output, a_intermediate_cache1, @@ -510,6 +529,7 @@ def _fused_moe_lora( expand_num_stages, expand_split_k, mul_routed_weight, + offset, ) diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index b6186e856152..a6ffbb7b71ce 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -483,6 +483,8 @@ def add_lora_fused_moe( expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, + fully_sharded: bool = False, + offset: int = 0, ): """ Performs a fused forward computation for LoRA of diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index ede50a48af98..d863a5884d3c 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -375,6 +375,8 @@ def add_lora_fused_moe( expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, + fully_sharded: bool = False, + offset: int = 0, ): """ Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer. @@ -408,4 +410,6 @@ def add_lora_fused_moe( expand_config.get("NUM_STAGES", 3), expand_config.get("SPLIT_K", 1), mul_routed_weight, + fully_sharded, + offset, ) From fdf93486d6c4f36be2f410a846bf68654041dc51 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 19 Nov 2025 18:35:29 +0800 Subject: [PATCH 182/578] [Docs] Clean up moe_kernel_features.md (#28530) Signed-off-by: windsonsea --- docs/design/moe_kernel_features.md | 92 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 47 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 36ae9506b65f..f0d5a3e934f3 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -1,4 +1,4 @@ -# Fused MoE Kernel features +# Fused MoE Kernel Features The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels. @@ -8,15 +8,15 @@ There are a number of all2all communication backends that are used to implement The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. -The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. 
+The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, and the finalize step requires the same format. All the backend `prepare` methods expect activations in the standard format and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. -The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. +The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports, e.g. deepep_high_throughput supports only block-quantized fp8 format. Any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 with per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step). -Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag. +Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag. -unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP. +Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. 
All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP. -| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Sub-class | -|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | -| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | -| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | -| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | -| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | -| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | +| Backend | Output act. format | Quant. types | Quant. 
format | Async | Apply Weight On Input | Subclass | +|---------|--------------------|--------------|---------------|-------|-----------------------|-----------| +| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | +| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | +| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | +| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | +| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | +| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | +| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | +| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 2. A,T quantization occurs after dispatch. 3. All quantization happens after dispatch. 4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency") - 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. + 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs without dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. 6. This depends on the experts implementation. --- @@ -66,44 +65,43 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] -## Fused MoE Experts Kernels +## Fused Experts Kernels -There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. 
Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. -Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. +Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`. Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type. -Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results. +Each experts kernel supports one or more activation functions, e.g. silu or gelu, which are applied to the intermediate results. As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts. Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`. -To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats. - -| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | -|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | -| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | -| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | -| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | -| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | -| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | -| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | -| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | -| marlin | standard | 3 | 3 | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| marlin experts | standard,
batched | N/A | N/A | silu,
swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | -| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | -| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | -| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | -| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | -| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | +To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats. + +| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | +|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------| +| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | +| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | +| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | +| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | +| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
+| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | +| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | +| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | +| marlin | standard,
batched | 3 / N/A | 3 / N/A | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | +| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | +| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | +| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | +| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | +| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | +| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 - 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params + 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params 3. uint4, uint8, fp8, fp4 4. This is a naive implementation of experts that supports batched format. Mainly used for testing. 5. The `activation` parameter is ignored and SwiGlu is used by default instead. @@ -113,8 +111,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts. -| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | -|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------| -| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | -| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts`| -| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | +| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | +|---------|-----------------------------------------|----------------------------------------------| +| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | +| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts` | +| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | From 815160958327d601933139b9e76a01eb6d2bc5cf Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:05:44 +0800 Subject: [PATCH 183/578] refactor(cpu_types_scalar.hpp): Unify scalar loop implementations using unroll_loop (#28847) Signed-off-by: ihb2032 <1355790728@qq.com> Co-authored-by: lyd1992 --- csrc/cpu/cpu_types_scalar.hpp | 222 +++++++++++++--------------------- 1 file changed, 87 insertions(+), 135 deletions(-) diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp index 1a9278bc662e..f9da78283da5 100644 --- a/csrc/cpu/cpu_types_scalar.hpp +++ b/csrc/cpu/cpu_types_scalar.hpp @@ -26,10 +26,6 @@ namespace vec_op { #define FORCE_INLINE __attribute__((always_inline)) inline -#define __max(a, b) ((a) > (b) ? (a) : (b)) -#define __min(a, b) ((a) < (b) ? (a) : (b)) -#define __abs(a) ((a) < (0) ? (0 - a) : (a)) - typedef struct f16x8_t { uint16_t val[8]; } f16x8_t; @@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec { explicit BF16Vec32(f16x32_t data) : reg(data) {}; explicit BF16Vec32(BF16Vec8& vec8_data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&vec8_data, this](int i) { reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM]; - } + }); } void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } @@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec { f32x4_t reg; explicit FP32Vec4(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec4() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec4(const float* ptr) @@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec { f32x8_t reg; explicit FP32Vec8(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec8() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec8(const float* ptr) @@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; explicit FP32Vec8(const FP16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } FP32Vec8(const BF16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } float reduce_sum() const { float result = 0; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; 
}); return result; } FP32Vec8 exp() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = expf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 tanh() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = tanhf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 er() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = erf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 operator*(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] * b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator+(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] + b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator-(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] - b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator/(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] / b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); return FP32Vec8(ret); } @@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec { f32x16_t reg; explicit FP32Vec16(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec16() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec16(const float* ptr) @@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x16_t data) : reg(data) {}; FP32Vec16(const FP32Vec4& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec8& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}; explicit FP32Vec16(const FP16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const BF16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; @@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec { FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; FP32Vec16 operator*(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] * b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); + return FP32Vec16(ret); } 
FP32Vec16 operator+(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] + b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator-(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] - b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator/(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] / b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 max(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __max(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::max(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 min(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __min(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::min(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 abs() const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __abs(reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); }); + return FP32Vec16(ret); } float reduce_sum() const { float result = 0.0f; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; }); return result; } float reduce_max() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __max(reg.val[i], result); - } + float result = std::numeric_limits::lowest(); + unroll_loop( + [&result, this](int i) { result = std::max(reg.val[i], result); }); return result; } float reduce_min() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __min(reg.val[i], result); - } + float result = std::numeric_limits::max(); + unroll_loop( + [&result, this](int i) { result = std::min(reg.val[i], result); }); return result; } @@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec { float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); float sum = 0.0; - int start = idx * group_size; - int end = (idx + 1) * group_size; - - for (; (start < VEC_ELEM_NUM) && (start < end); ++start) { - sum += reg.val[start]; - } - + const int start = idx * group_size; + unroll_loop( + [&sum, &start, this](int i) { sum += reg.val[start + i]; }); return sum; } @@ -477,17 +437,13 @@ inline void storeFP32(float v, c10::BFloat16* ptr) { } inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = 
float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { @@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { } inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); } From bbc6c2f1e5bc856a9265dfa2b379ed1d242adc33 Mon Sep 17 00:00:00 2001 From: j20120307 Date: Wed, 19 Nov 2025 03:07:22 -0800 Subject: [PATCH 184/578] [CI/Build] Fix broken build on Apple M1 (#28999) Signed-off-by: Kan Zhu --- csrc/cpu/utils.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp index d8399c56f6af..d3def306b806 100644 --- a/csrc/cpu/utils.hpp +++ b/csrc/cpu/utils.hpp @@ -6,6 +6,10 @@ #include #include +#if defined(__APPLE__) + #include +#endif + #include "cpu_types.hpp" namespace cpu_utils { @@ -21,10 +25,12 @@ struct VecTypeTrait { using vec_t = vec_op::FP32Vec16; }; +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) template <> struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#endif template <> struct VecTypeTrait { @@ -44,9 +50,21 @@ struct Counter { inline int64_t get_l2_size() { static int64_t size = []() { +#if defined(__APPLE__) + // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. + int64_t l2_cache_size = 0; + size_t len = sizeof(l2_cache_size); + if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && + l2_cache_size > 0) { + return l2_cache_size >> 1; // use 50% of L2 cache + } + // Fallback if sysctlbyname fails + return 128LL * 1024 >> 1; // use 50% of 128KB +#else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); assert(l2_cache_size != -1); return l2_cache_size >> 1; // use 50% of L2 cache +#endif }(); return size; } From 97cfa99d59375de6d5e4c17dc6aea955ae75b493 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 12:32:04 +0100 Subject: [PATCH 185/578] [Docs] Take env var definition out of folded admonition (#29005) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/configuration/env_vars.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 2c0a898754fa..f6d548a19d91 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system: All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -??? 
code - - ```python - --8<-- "vllm/envs.py:env-vars-definition" - ``` +```python +--8<-- "vllm/envs.py:env-vars-definition" +``` From ba558c029ad65ab4f040c8320607ebd87612cf08 Mon Sep 17 00:00:00 2001 From: Tova Movshovitz Date: Wed, 19 Nov 2025 13:37:11 +0200 Subject: [PATCH 186/578] [config] Expose `get_total_num_hidden_layers()` in ModelConfig (#28961) Signed-off-by: tovam Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/config/model.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 3e8790a26e0e..f61dbb6a695a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1369,11 +1369,7 @@ def get_num_experts(self) -> int: # Coerce to 0 if explicitly set to None return num_experts or 0 - def get_layers_start_end_indices( - self, parallel_config: ParallelConfig - ) -> tuple[int, int]: - from vllm.distributed.utils import get_pp_indices - + def get_total_num_hidden_layers(self) -> int: if ( self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" @@ -1393,6 +1389,15 @@ def get_layers_start_end_indices( total_num_hidden_layers = getattr( self.hf_text_config, "num_hidden_layers", 0 ) + return total_num_hidden_layers + + def get_layers_start_end_indices( + self, parallel_config: ParallelConfig + ) -> tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + + total_num_hidden_layers = self.get_total_num_hidden_layers() + # the layout order is: DP x PP x TP pp_rank = ( parallel_config.rank // parallel_config.tensor_parallel_size From da2f6800e0d6ac768c6f63b95f7c0755407f4263 Mon Sep 17 00:00:00 2001 From: Chen Bruce Date: Wed, 19 Nov 2025 20:46:24 +0800 Subject: [PATCH 187/578] [Feat][Perf] Enable deepep-low-latency with round-robin expert placement. 
(#28449) Signed-off-by: bruceszchen Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../layers/fused_moe/all2all_utils.py | 11 ++ .../fused_moe/deepep_ll_prepare_finalize.py | 30 +++- .../layers/fused_moe/fused_moe_method_base.py | 9 +- vllm/model_executor/layers/fused_moe/layer.py | 157 +++++++++++++++--- .../fused_moe/unquantized_fused_moe_method.py | 7 +- .../compressed_tensors_moe.py | 14 +- .../model_executor/layers/quantization/fp8.py | 7 +- .../layers/quantization/modelopt.py | 10 +- 8 files changed, 208 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py index 2dd625054339..86c50f39f007 100644 --- a/vllm/model_executor/layers/fused_moe/all2all_utils.py +++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py @@ -67,6 +67,7 @@ def maybe_roundup_layer_hidden_size( def maybe_make_prepare_finalize( moe: FusedMoEConfig, quant_config: FusedMoEQuantConfig | None, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> FusedMoEPrepareAndFinalize | None: if not moe.moe_parallel_config.use_all2all_kernels: return None @@ -134,6 +135,13 @@ def maybe_make_prepare_finalize( elif moe.use_deepep_ll_kernels: assert quant_config is not None + global_to_physical = physical_to_global = local_expert_global_ids = None + if routing_tables is not None: + ( + global_to_physical, + physical_to_global, + local_expert_global_ids, + ) = routing_tables all_to_all_args = dict( max_num_tokens_per_dp_rank=moe.max_num_tokens, token_hidden_size=moe.hidden_dim, @@ -155,6 +163,9 @@ def maybe_make_prepare_finalize( max_tokens_per_rank=moe.max_num_tokens, num_dispatchers=all2all_manager.world_size, use_fp8_dispatch=use_fp8_dispatch, + global_to_physical=global_to_physical, + physical_to_global=physical_to_global, + local_expert_global_ids=local_expert_global_ids, ) return prepare_finalize diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 06c9df317f7c..e0db248958b4 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -85,6 +85,9 @@ def __init__( max_tokens_per_rank: int, num_dispatchers: int, use_fp8_dispatch: bool = False, + global_to_physical: torch.Tensor | None = None, + physical_to_global: torch.Tensor | None = None, + local_expert_global_ids: torch.Tensor | None = None, ): super().__init__() @@ -97,6 +100,17 @@ def __init__( self.handles: list[tuple | None] = [None, None] self.num_dispatchers_ = num_dispatchers + topk_indices_dtype = self.topk_indices_dtype() + + def _maybe_cast(tensor: torch.Tensor | None) -> torch.Tensor | None: + if tensor is None or topk_indices_dtype is None: + return tensor + return tensor.to(dtype=topk_indices_dtype) + + self.global_to_physical = _maybe_cast(global_to_physical) + self.physical_to_global = _maybe_cast(physical_to_global) + self.local_expert_global_ids = _maybe_cast(local_expert_global_ids) + # We don't have enough information to determine if we should dispatch # activation scales in a packed ue8m0 format during object construction # time. This setting is handled by post_init_setup. 
@@ -136,6 +150,16 @@ def max_num_tokens_per_rank(self) -> int | None: def topk_indices_dtype(self) -> torch.dtype | None: return torch.int64 + def _map_global_to_physical_ids(self, topk_ids: torch.Tensor) -> torch.Tensor: + if self.global_to_physical is None: + return topk_ids + return self.global_to_physical[topk_ids] + + def _map_local_to_global_ids(self, expert_topk_ids: torch.Tensor) -> torch.Tensor: + if self.local_expert_global_ids is None: + return expert_topk_ids + return self.local_expert_global_ids[expert_topk_ids] + def _do_quant( self, x: torch.Tensor | tuple[torch.Tensor, torch.Tensor], @@ -226,9 +250,10 @@ def prepare_async( a1 = a1 * topk_weights.to(a1.dtype) # Dispatch + dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids) expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch( a1, - topk_ids, + dispatch_topk_ids, self.max_tokens_per_rank, num_experts, use_fp8=self.use_fp8_dispatch, @@ -313,11 +338,12 @@ def _finalize( # weights have already been applied. combine_topk_weights = torch.ones_like(topk_weights) + combine_topk_ids = self._map_global_to_physical_ids(topk_ids) # TODO (varun) : Enable zero copy mode dbo_maybe_run_recv_hook() _, _, recv_hook = self.buffer.low_latency_combine( fused_expert_output, - topk_ids, + combine_topk_ids, combine_topk_weights, handle, async_finish=False, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 87f8c8d75a9b..073e90a4e680 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -50,10 +50,15 @@ def uses_weight_scale_2_pattern(self) -> bool: """ return False - def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> FusedMoEPrepareAndFinalize | None: from .all2all_utils import maybe_make_prepare_finalize - return maybe_make_prepare_finalize(self.moe, self.moe_quant_config) + return maybe_make_prepare_finalize( + self.moe, self.moe_quant_config, routing_tables + ) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 023132acfed3..c41995e4a913 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -5,7 +5,7 @@ from contextlib import nullcontext from enum import Enum from functools import partial -from typing import Literal, get_args, overload +from typing import Literal, cast, get_args, overload import torch import torch.nn.functional as F @@ -192,6 +192,42 @@ def determine_expert_map( return (local_num_experts, expert_map, expert_mask) +def determine_expert_placement_strategy( + expert_placement_strategy: ExpertPlacementStrategy, + moe_parallel_config: FusedMoEParallelConfig, + num_expert_group: int | None, + num_redundant_experts: int, + enable_eplb: bool, +) -> ExpertPlacementStrategy: + if expert_placement_strategy == "round_robin": + round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." 
+ ) + return "linear" + if ( + moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.use_deepep_ll_kernels + ): + logger.warning( + "Round-robin expert placement currently only supports " + "the DeepEP low-latency backend, but '%s' was configured. " + "Falling back to linear expert placement.", + moe_parallel_config.all2all_backend, + ) + return "linear" + + return expert_placement_strategy + + def get_compressed_expert_map(expert_map: torch.Tensor) -> str: """ Compresses the expert map by removing any -1 entries. @@ -400,6 +436,9 @@ def __init__( self.expert_load_view: torch.Tensor | None = None self.logical_to_physical_map: torch.Tensor | None = None self.logical_replica_count: torch.Tensor | None = None + self.expert_placement_strategy: ExpertPlacementStrategy = ( + vllm_config.parallel_config.expert_placement_strategy + ) # ROCm aiter shared experts fusion self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled() @@ -433,38 +472,27 @@ def __init__( "Redundant experts are only supported with EPLB." ) - expert_placement_strategy = ( - vllm_config.parallel_config.expert_placement_strategy + self.expert_placement_strategy = determine_expert_placement_strategy( + expert_placement_strategy=self.expert_placement_strategy, + moe_parallel_config=self.moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=self.enable_eplb, ) - if expert_placement_strategy == "round_robin": - # TODO(Bruce): will support round robin expert placement with - # EPLB enabled in the future. - round_robin_supported = ( - (num_expert_group is not None and num_expert_group > 1) - and num_redundant_experts == 0 - and not self.enable_eplb - ) - - if not round_robin_supported: - logger.warning( - "Round-robin expert placement is only supported for " - "models with multiple expert groups and no redundant " - "experts. Falling back to linear expert placement." - ) - expert_placement_strategy = "linear" self.expert_map: torch.Tensor | None local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, - expert_placement_strategy=expert_placement_strategy, + expert_placement_strategy=self.expert_placement_strategy, num_fused_shared_experts=self.num_fused_shared_experts, return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts self.register_buffer("expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) + self._maybe_init_expert_routing_tables() logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " "placement strategy: %s. Local/global" @@ -472,7 +500,7 @@ def __init__( " %s.", self.ep_rank, self.ep_size, - expert_placement_strategy, + self.expert_placement_strategy, self.local_num_experts, self.global_num_experts, get_compressed_expert_map(self.expert_map), @@ -621,7 +649,12 @@ def _get_quant_method() -> FusedMoEMethodBase: # should be safe to swap out the quant_method. def maybe_init_modular_kernel(self) -> None: self.ensure_moe_quant_config_init() - prepare_finalize = self.quant_method.maybe_make_prepare_finalize() + # routing_tables only needed for round-robin expert placement with + # DeepEP all2all backend. 
+ routing_tables = self._maybe_init_expert_routing_tables() + prepare_finalize = self.quant_method.maybe_make_prepare_finalize( + routing_tables=routing_tables + ) if prepare_finalize is not None: logger.debug( "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) @@ -703,6 +736,84 @@ def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass return False + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + # Currently routing_tables only needed for round-robin expert placement + # with DeepEP-ll all2all backend. + if ( + self.expert_placement_strategy != "round_robin" + or not self.use_deepep_ll_kernels + ): + return None + + if hasattr(self, "expert_global_to_physical"): + return cast( + tuple[torch.Tensor, torch.Tensor, torch.Tensor], + ( + self.expert_global_to_physical, + self.expert_physical_to_global, + self.expert_local_to_global, + ), + ) + + if self.expert_map is None: + return None + + routing_tables = self.ensure_round_robin_expert_routing_tables( + global_num_experts=self.global_num_experts, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + local_num_experts=self.local_num_experts, + device=self.expert_map.device, + ) + + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) + + return routing_tables + + @staticmethod + def ensure_round_robin_expert_routing_tables( + global_num_experts: int, + ep_size: int, + ep_rank: int, + local_num_experts: int, + device: torch.device | None = None, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + device_kwargs = {"device": device} if device is not None else {} + global_indices = torch.arange( + global_num_experts, dtype=torch.long, **device_kwargs + ) + owner = torch.remainder(global_indices, ep_size) + local_index = torch.div(global_indices, ep_size, rounding_mode="floor") + base = global_num_experts // ep_size + remainder = global_num_experts % ep_size + physical_offset = owner * base + if remainder > 0: + remainder_tensor = torch.tensor( + remainder, dtype=torch.long, **device_kwargs + ) + physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) + + global_to_physical = physical_offset + local_index + physical_to_global = torch.empty_like(global_to_physical) + physical_to_global[global_to_physical] = global_indices + + local_global = torch.arange( + ep_rank, + global_num_experts, + ep_size, + dtype=torch.long, + **device_kwargs, + ) + if local_global.numel() != local_num_experts: + local_global = local_global[:local_num_experts] + + return (global_to_physical, physical_to_global, local_global) + def update_expert_map(self): # ep_size and ep_rank should already be updated assert self.expert_map is not None @@ -711,12 +822,14 @@ def update_expert_map(self): ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, + expert_placement_strategy=self.expert_placement_strategy, num_fused_shared_experts=self.num_fused_shared_experts, return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts self.register_buffer("expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) + self._maybe_init_expert_routing_tables() if self.aiter_fmoe_shared_expert_enabled: self._init_aiter_shared_experts_topK_buffer( 
vllm_config=get_current_vllm_config(), diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 2e0376553b91..63b0e6f573d6 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -108,11 +108,14 @@ def supports_eplb(self) -> bool: def allow_inplace(self) -> bool: return True - def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> FusedMoEPrepareAndFinalize | None: if self.rocm_aiter_moe_enabled: return None else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 06ee96d55419..22b3c477f420 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -380,11 +380,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: (layer.w2_input_global_scale), requires_grad=False ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin: return None elif not self.allow_flashinfer: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe) logger.debug_once("%s", prepare_finalize.__class__.__name__) @@ -890,11 +893,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight_scale ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin or self.rocm_aiter_moe_enabled: return None else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0479bec33840..92fbdd709348 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1018,7 +1018,10 @@ def process_weights_after_loading(self, layer: Module) -> None: del layer.w13_input_scale del layer.w2_input_scale - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if ( self.rocm_aiter_moe_enabled or self.use_marlin @@ -1039,7 +1042,7 @@ def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return 
super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 476521813f46..38ab7cd4f115 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -373,6 +373,7 @@ def __init__( def maybe_make_prepare_finalize( self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: # TRT LLM not supported with all2all yet. if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: @@ -384,7 +385,7 @@ def maybe_make_prepare_finalize( logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, @@ -1179,7 +1180,10 @@ def __init__( " for ModelOptNvFp4FusedMoE." ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin or ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM @@ -1196,7 +1200,7 @@ def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, From 09540cd918a5f7d776d7f7e0abec78fbc03938ad Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:56:21 +0100 Subject: [PATCH 188/578] [Doc]: fix typos in various files (#29010) Signed-off-by: Didier Durand --- docs/deployment/frameworks/skypilot.md | 2 +- docs/design/prefix_caching.md | 2 +- docs/features/nixl_connector_usage.md | 2 +- docs/getting_started/quickstart.md | 2 +- tests/v1/ec_connector/integration/README.md | 2 +- vllm/multimodal/evs.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index f4a984a6433e..e9b0d5f0671c 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -4,7 +4,7 @@ vLLM

-vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc., can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). ## Prerequisites diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index bd4070f381d8..48536a877bd3 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -1,6 +1,6 @@ # Automatic Prefix Caching -Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang). +Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc.) and most open source LLM inference frameworks (e.g., SGLang). While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block: diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 1ce038f4d652..f0e25e31aa0b 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -158,7 +158,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ ## Experimental Feature -### Heterogenuous KV Layout support +### Heterogeneous KV Layout support Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index cfc8b4d9838a..9e86f785b10c 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -286,7 +286,7 @@ If desired, you can also manually set the backend of your choice by configuring - On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. - On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`. 
-For AMD ROCm, you can futher control the specific Attention implementation using the following variables: +For AMD ROCm, you can further control the specific Attention implementation using the following variables: - Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` - AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md index 30426e055ade..2dbcb307fda3 100644 --- a/tests/v1/ec_connector/integration/README.md +++ b/tests/v1/ec_connector/integration/README.md @@ -113,7 +113,7 @@ Quick sanity check: - Outputs differ between baseline and disagg - Server startup fails -- Encoder cache not found (should fallback to local execution) +- Encoder cache not found (should fall back to local execution) - Proxy routing errors ## Notes diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py index 4a288d2d238c..8a36ea415da4 100644 --- a/vllm/multimodal/evs.py +++ b/vllm/multimodal/evs.py @@ -185,7 +185,7 @@ def recompute_mrope_positions( Args: input_ids: (N,) All input tokens of the prompt (entire sequence). - multimodal_positions: List of mrope positsions for each media. + multimodal_positions: List of mrope positions for each media. mrope_positions: Existing mrope positions (4, N) for entire sequence. num_computed_tokens: A number of computed tokens so far. vision_start_token_id: Token indicating start of vision media. From 4f5299f7174ffb10bdc640b47d3494083fc39c48 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:50:30 +0100 Subject: [PATCH 189/578] Relax Transformers modeling backend MoE experts check (#28952) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 4 +++- vllm/model_executor/models/transformers/moe.py | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bd14bbb9ab66..80fe143269a7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -79,7 +79,9 @@ To make your model compatible with the Transformers modeling backend, it needs: 1. Add `is_causal = False` to `MyAttention`. - If your model is mixture-of-experts (MoE): 1. Your sparse MoE block must have an attribute called `experts`. - 2. The class of `experts` (`MyExperts`) must inherit from `nn.ModuleList`. + 2. The class of `experts` (`MyExperts`) must either: + - Inherit from `nn.ModuleList` (naive). + - Or contain all 3D `nn.Parameters` (packed). 3. `MyExperts.forward` must accept `hidden_states`, `top_k_index`, `top_k_weights`. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 3. `MyModel` must contain `_supports_attention_backend = True`. 
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 4973014c3d4e..31db9d682bd4 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -256,7 +256,14 @@ def forward(self, *args, **kwargs): def _recursive_replace(module: nn.Module, prefix: str): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - if child_name == "experts" and isinstance(child_module, nn.ModuleList): + # Naive implementations will have experts as ModuleList + is_modulelist = isinstance(child_module, nn.ModuleList) + # Packed implementations will have experts as 3D tensors of shapes like: + # gate_up_proj = (num_experts, 2 * intermediate_size, hidden_size) + # down_proj = (num_experts, intermediate_size, hidden_size) + params = list(child_module.parameters()) + is_3d = len(params) > 0 and all(p.ndim == 3 for p in params) + if child_name == "experts" and (is_modulelist or is_3d): # Alias for readability mlp = module experts = child_module From 2c8b9182b5ced00d83bed15ef8bc0ac6e079b6ee Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Wed, 19 Nov 2025 06:13:50 -0800 Subject: [PATCH 190/578] [CI] Reorganize compile tests so new tests are automatically included in CI (#28625) Signed-off-by: Yanan Cao --- .buildkite/test-amd.yaml | 57 ++++++++--------- .buildkite/test-pipeline.yaml | 62 +++++++++---------- tests/compile/README.md | 5 ++ .../{piecewise => distributed}/__init__.py | 0 .../{ => distributed}/test_async_tp.py | 6 +- .../test_fusion_all_reduce.py | 4 +- .../{ => distributed}/test_fusions_e2e.py | 2 +- .../test_sequence_parallelism.py | 4 +- tests/compile/fullgraph/__init__.py | 0 .../{ => fullgraph}/test_basic_correctness.py | 2 +- .../test_full_cudagraph.py | 0 .../{ => fullgraph}/test_full_graph.py | 2 +- .../test_multimodal_compile.py | 0 .../test_multiple_graphs.py | 0 .../{piecewise => fullgraph}/test_simple.py | 0 .../test_toy_llama.py | 0 vllm/env_override.py | 2 +- 17 files changed, 74 insertions(+), 72 deletions(-) create mode 100644 tests/compile/README.md rename tests/compile/{piecewise => distributed}/__init__.py (100%) rename tests/compile/{ => distributed}/test_async_tp.py (99%) rename tests/compile/{ => distributed}/test_fusion_all_reduce.py (99%) rename tests/compile/{ => distributed}/test_fusions_e2e.py (99%) rename tests/compile/{ => distributed}/test_sequence_parallelism.py (99%) create mode 100644 tests/compile/fullgraph/__init__.py rename tests/compile/{ => fullgraph}/test_basic_correctness.py (99%) rename tests/compile/{piecewise => fullgraph}/test_full_cudagraph.py (100%) rename tests/compile/{ => fullgraph}/test_full_graph.py (99%) rename tests/compile/{ => fullgraph}/test_multimodal_compile.py (100%) rename tests/compile/{piecewise => fullgraph}/test_multiple_graphs.py (100%) rename tests/compile/{piecewise => fullgraph}/test_simple.py (100%) rename tests/compile/{piecewise => fullgraph}/test_toy_llama.py (100%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 2471b509a9ff..0049f3540340 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -187,7 +187,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ 
-215,7 +215,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -493,17 +493,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - # - pytest -v -s compile/test_sequence_parallelism.py - # - pytest -v -s compile/test_async_tp.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -515,9 +510,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -529,10 +526,10 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" - label: Cudagraph test timeout_in_minutes: 20 @@ -1066,10 +1063,10 @@ steps: - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1086,14 +1083,14 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 @@ -1198,7 +1195,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1211,7 +1208,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1417,10 +1414,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest 
-v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4ac76aba67b9..e62cd60efaec 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -167,7 +167,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -197,7 +197,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -445,18 +445,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_graph_partition.py - - pytest -v -s compile/test_config.py - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py - - pytest -v -s compile/test_qk_norm_rope_fusion.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -466,9 +460,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -479,10 +475,10 @@ steps: - tests/compile commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -939,17 +935,22 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -966,12 +967,11 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1069,7 +1069,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1084,7 +1084,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v 
-s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1264,10 +1264,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 diff --git a/tests/compile/README.md b/tests/compile/README.md new file mode 100644 index 000000000000..300a95686000 --- /dev/null +++ b/tests/compile/README.md @@ -0,0 +1,5 @@ +# compile test folder structure + +- `compile/test_*.py` : various unit tests meant for testing particular code paths/features. Future tests will most likely be added here. New test files added here will be included in CI automatically +- `compile/fullgraph/` : full model tests, including all tests previously in compile/piecewise. These tests do not target particular features. New test files added here will be included in CI automatically +- `compile/distributed/` : tests that require multiple GPUs. New test files added here will **NOT** be included in CI automatically as these tests generally need to be manually configured to run in runners with a particular number/type of GPUs.
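To make the CI comments above concrete: each top-level test file under `compile/` is launched in its own pytest process via `find ... -exec pytest`, which is the workaround for vllm-project/vllm issue #28965 and is also what makes new test files picked up without editing the pipeline YAML. Below is a rough, hedged Python equivalent; the path and invocation details are illustrative, and the real pipelines invoke `find` directly.

```python
# Rough Python equivalent of the `find ... -exec pytest` commands in the CI
# steps above (a sketch only; the pipelines invoke `find` directly).
# Running each file in its own pytest process is the workaround for
# https://github.com/vllm-project/vllm/issues/28965.
import pathlib
import subprocess
import sys

compile_dir = pathlib.Path("tests/compile")  # assumed working-directory layout
for test_file in sorted(compile_dir.glob("test_*.py")):  # non-recursive, like -maxdepth 1
    subprocess.run(
        [sys.executable, "-m", "pytest", "-s", "-v", str(test_file)],
        check=True,
    )
```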
diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/distributed/__init__.py similarity index 100% rename from tests/compile/piecewise/__init__.py rename to tests/compile/distributed/__init__.py diff --git a/tests/compile/test_async_tp.py b/tests/compile/distributed/test_async_tp.py similarity index 99% rename from tests/compile/test_async_tp.py rename to tests/compile/distributed/test_async_tp.py index 71ee22878143..86d409f1eadb 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/distributed/test_async_tp.py @@ -27,13 +27,13 @@ from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..models.registry import HF_EXAMPLE_MODELS -from ..utils import ( +from ...models.registry import HF_EXAMPLE_MODELS +from ...utils import ( compare_two_settings, create_new_process_for_each_test, multi_gpu_test, ) -from .backend import TestBackend +from ..backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py similarity index 99% rename from tests/compile/test_fusion_all_reduce.py rename to tests/compile/distributed/test_fusion_all_reduce.py index 6d0a0ed7d89d..d401d5703275 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/distributed/test_fusion_all_reduce.py @@ -33,8 +33,8 @@ from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..utils import has_module_attribute, multi_gpu_test -from .backend import TestBackend +from ...utils import has_module_attribute, multi_gpu_test +from ..backend import TestBackend class TestAllReduceRMSNormModel(torch.nn.Module): diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py similarity index 99% rename from tests/compile/test_fusions_e2e.py rename to tests/compile/distributed/test_fusions_e2e.py index f22d60ef000b..2e1b595a4389 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -18,7 +18,7 @@ from vllm.utils.flashinfer import has_flashinfer from vllm.utils.torch_utils import is_torch_equal_or_newer -from ..utils import flat_product, multi_gpu_test +from ...utils import flat_product, multi_gpu_test is_blackwell = lambda: current_platform.is_device_capability(100) """Are we running on Blackwell, a lot of tests depend on it""" diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py similarity index 99% rename from tests/compile/test_sequence_parallelism.py rename to tests/compile/distributed/test_sequence_parallelism.py index 9cd7f64b04af..30084dfd5a95 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -32,8 +32,8 @@ from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..utils import multi_gpu_test -from .backend import TestBackend +from ...utils import multi_gpu_test +from ..backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() prompts = [ diff --git a/tests/compile/fullgraph/__init__.py b/tests/compile/fullgraph/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py similarity index 99% rename from tests/compile/test_basic_correctness.py rename to 
tests/compile/fullgraph/test_basic_correctness.py index 3f6898607f6b..965938c4433d 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -7,7 +7,7 @@ from vllm.config import CompilationMode from vllm.utils.torch_utils import cuda_device_count_stateless -from ..utils import compare_all_settings +from ...utils import compare_all_settings @dataclasses.dataclass diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py similarity index 100% rename from tests/compile/piecewise/test_full_cudagraph.py rename to tests/compile/fullgraph/test_full_cudagraph.py diff --git a/tests/compile/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py similarity index 99% rename from tests/compile/test_full_graph.py rename to tests/compile/fullgraph/test_full_graph.py index b4e5e56ac9fe..2c11ecef7f02 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -15,7 +15,7 @@ from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer -from ..utils import create_new_process_for_each_test +from ...utils import create_new_process_for_each_test def models_list(*, all: bool = True, keywords: list[str] | None = None): diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py similarity index 100% rename from tests/compile/test_multimodal_compile.py rename to tests/compile/fullgraph/test_multimodal_compile.py diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/fullgraph/test_multiple_graphs.py similarity index 100% rename from tests/compile/piecewise/test_multiple_graphs.py rename to tests/compile/fullgraph/test_multiple_graphs.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/fullgraph/test_simple.py similarity index 100% rename from tests/compile/piecewise/test_simple.py rename to tests/compile/fullgraph/test_simple.py diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/fullgraph/test_toy_llama.py similarity index 100% rename from tests/compile/piecewise/test_toy_llama.py rename to tests/compile/fullgraph/test_toy_llama.py diff --git a/vllm/env_override.py b/vllm/env_override.py index 14dae2850c35..9ae1af3af46c 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -95,7 +95,7 @@ def get_output_names(graph_outputs) -> list[str]: # =================================================== # This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to # fix inductor partition + attention-nvfp4 quant fusion, tested in -# `tests/compile/test_fusions_e2e.py::test_attn_quant`. +# `tests/compile/distributed/test_fusions_e2e.py::test_attn_quant`. # For more context, see https://github.com/pytorch/pytorch/pull/165815. 
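The following patch ([PATCH 191/578]) moves the config hashing used for torch.compile cache keys from hand-maintained factor lists to an opt-out scheme built on `get_hash_factors`, `hash_factors`, and `normalize_value` in `vllm/config/utils.py`. As a self-contained sketch of that pattern, using toy names only (nothing below is vLLM API; it just illustrates the idea):

```python
# Standalone sketch of the opt-out hashing pattern introduced in the next
# patch. All names here are toy stand-ins; the real helpers
# (get_hash_factors, hash_factors, normalize_value) live in vllm/config/utils.py.
import hashlib
import json
from dataclasses import dataclass, fields


def normalize(value):
    # Minimal normalizer: lists become tuples; everything else is assumed
    # to already be JSON-serializable.
    return tuple(value) if isinstance(value, list) else value


@dataclass
class ToyConfig:
    mode: int = 0
    custom_ops: list[str] | None = None
    cache_dir: str = ""  # runtime-only detail, excluded from the hash


def compute_hash(cfg: ToyConfig, ignored: set[str]) -> str:
    # Opt-out: every declared field is hashed unless explicitly ignored.
    factors = {
        f.name: normalize(getattr(cfg, f.name))
        for f in fields(cfg)
        if f.name not in ignored
    }
    return hashlib.sha256(json.dumps(factors, sort_keys=True).encode()).hexdigest()


print(compute_hash(ToyConfig(custom_ops=["+rms_norm"]), ignored={"cache_dir"}))
```

Every declared field participates in the cache key unless explicitly listed as ignored, which is the opt-out behavior the patch title refers to.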
From 1ffe934c8ae978e5ed82559a1eaeca05e37f9b35 Mon Sep 17 00:00:00 2001 From: vnadathur Date: Wed, 19 Nov 2025 06:13:54 -0800 Subject: [PATCH 191/578] [torch.compile] caching of config fields should be opt-out by default (#26468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: vnadathur Signed-off-by: WorldExplored Signed-off-by: Srreyansh Sethi Signed-off-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Co-authored-by: WorldExplored Co-authored-by: Srreyansh Sethi <107075589+worldexplored@users.noreply.github.com> Co-authored-by: vnadathur <236933696+vnadathur@users.noreply.github.com> Co-authored-by: Luka Govedič --- tests/config/test_config_utils.py | 166 +++++++++++++++++++++++++++++ vllm/compilation/backends.py | 105 +++++++++++++++---- vllm/compilation/pass_manager.py | 2 +- vllm/config/cache.py | 31 ++++-- vllm/config/compilation.py | 40 +++---- vllm/config/model.py | 92 ++++++++-------- vllm/config/parallel.py | 49 ++++++--- vllm/config/utils.py | 119 ++++++++++++++++++++- vllm/envs.py | 169 +++++++++++++++--------------- vllm/logging_utils/__init__.py | 2 + vllm/logging_utils/lazy.py | 20 ++++ 11 files changed, 602 insertions(+), 193 deletions(-) create mode 100644 tests/config/test_config_utils.py create mode 100644 vllm/logging_utils/lazy.py diff --git a/tests/config/test_config_utils.py b/tests/config/test_config_utils.py new file mode 100644 index 000000000000..1277c7e64eb2 --- /dev/null +++ b/tests/config/test_config_utils.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from enum import Enum + +import pytest + +from vllm.config.utils import get_hash_factors, hash_factors, normalize_value + +# Helpers + + +def endswith_fqname(obj, suffix: str) -> bool: + # normalize_value(type) returns fully-qualified name + # Compare suffix to avoid brittle import paths. + out = normalize_value(obj) + return isinstance(out, str) and out.endswith(suffix) + + +def expected_path(p_str: str = ".") -> str: + import pathlib + + p = pathlib.Path(p_str) + return p.expanduser().resolve().as_posix() + + +# Minimal dataclass to test get_hash_factors. +# Avoid importing heavy vLLM configs. +@dataclass +class SimpleConfig: + a: object + b: object | None = None + + +class DummyLogprobsMode(Enum): + RAW_LOGITS = "raw_logits" + + +def test_hash_factors_deterministic(): + """Test that hash_factors produces consistent SHA-256 hashes""" + factors = {"a": 1, "b": "test"} + hash1 = hash_factors(factors) + hash2 = hash_factors(factors) + + assert hash1 == hash2 + # Dict key insertion order should not affect the hash. + factors_reordered = {"b": "test", "a": 1} + assert hash_factors(factors_reordered) == hash1 + assert len(hash1) == 64 + assert all(c in "0123456789abcdef" for c in hash1) + + +@pytest.mark.parametrize( + "inp, expected", + [ + (None, None), + (True, True), + (1, 1), + (1.0, 1.0), + ("x", "x"), + (b"ab", "6162"), + (bytearray(b"ab"), "6162"), + ([1, 2], (1, 2)), + ({"b": 2, "a": 1}, (("a", 1), ("b", 2))), + ], +) +def test_normalize_value_matrix(inp, expected): + """Parametric input→expected normalization table.""" + assert normalize_value(inp) == expected + + +def test_normalize_value_enum(): + # Enums normalize to (module.QualName, value). + # DummyLogprobsMode uses a string payload. 
+ out = normalize_value(DummyLogprobsMode.RAW_LOGITS) + assert isinstance(out, tuple) + assert out[0].endswith("DummyLogprobsMode") + # Expect string payload 'raw_logits'. + assert out[1] == "raw_logits" + + +def test_normalize_value_set_order_insensitive(): + # Sets are unordered; normalize_value sorts elements for determinism. + assert normalize_value({3, 1, 2}) == normalize_value({1, 2, 3}) + + +def test_normalize_value_path_normalization(): + from pathlib import Path # local import to avoid global dependency + + # Paths expand/resolve to absolute strings. + # Stabilizes hashing across working dirs. + assert normalize_value(Path(".")) == expected_path(".") + + +def test_normalize_value_uuid_and_to_json(): + # Objects may normalize via uuid() or to_json_string(). + class HasUUID: + def uuid(self): + return "test-uuid" + + class ToJson: + def to_json_string(self): + return '{"x":1}' + + assert normalize_value(HasUUID()) == "test-uuid" + assert normalize_value(ToJson()) == '{"x":1}' + + +@pytest.mark.parametrize( + "bad", + [ + (lambda x: x), + (type("CallableInstance", (), {"__call__": lambda self: 0}))(), + (lambda: (lambda: 0))(), # nested function instance + ], +) +def test_error_cases(bad): + """Inputs expected to raise TypeError.""" + # Reject functions/lambdas/callable instances + # to avoid under-hashing. + with pytest.raises(TypeError): + normalize_value(bad) + + +def test_enum_vs_int_disambiguation(): + # int stays primitive + nf_int = normalize_value(1) + assert nf_int == 1 + + # enum becomes ("module.QualName", value) + nf_enum = normalize_value(DummyLogprobsMode.RAW_LOGITS) + assert isinstance(nf_enum, tuple) and len(nf_enum) == 2 + enum_type, enum_val = nf_enum + assert enum_type.endswith(".DummyLogprobsMode") + assert enum_val == "raw_logits" + + # Build factor dicts from configs with int vs enum + f_int = get_hash_factors(SimpleConfig(1), set()) + f_enum = get_hash_factors(SimpleConfig(DummyLogprobsMode.RAW_LOGITS), set()) + # The int case remains a primitive value + assert f_int["a"] == 1 + # The enum case becomes a tagged tuple ("module.QualName", "raw_logits") + assert isinstance(f_enum["a"], tuple) and f_enum["a"][1] == "raw_logits" + # Factor dicts must differ so we don't collide primitives with Enums. + assert f_int != f_enum + # Hash digests must differ correspondingly + assert hash_factors(f_int) != hash_factors(f_enum) + + # Hash functions produce stable hex strings + h_int = hash_factors(f_int) + h_enum = hash_factors(f_enum) + assert isinstance(h_int, str) and len(h_int) == 64 + assert isinstance(h_enum, str) and len(h_enum) == 64 + + +def test_classes_are_types(): + """Types normalize to FQNs; include real vLLM types.""" + # Only classes allowed; functions/lambdas are rejected. + # Canonical form is the fully-qualified name. 
+ assert isinstance(normalize_value(str), str) + + class LocalDummy: + pass + + assert endswith_fqname(LocalDummy, ".LocalDummy") diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 60ef6eef2166..1e66f21ff638 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -4,12 +4,14 @@ import ast import dataclasses import hashlib +import json import operator import os import pprint import time from collections.abc import Callable, Sequence from contextlib import contextmanager +from functools import partial from typing import Any import torch @@ -23,7 +25,9 @@ should_split, ) from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig +from vllm.config.utils import hash_factors from vllm.logger import init_logger +from vllm.logging_utils import lazy from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -580,35 +584,47 @@ def configure_post_pass(self): def __call__( self, graph: fx.GraphModule, example_inputs ) -> VllmSerializableFunction: - from .caching import _compute_code_hash, compilation_config_hash_factors - vllm_config = self.vllm_config + # Minimal hashing here with existing utilities, reused below. + + env_factors = envs.compile_factors() + env_hash = hash_factors(env_factors) + # Compute config/compiler/code hashes once and reuse + config_hash = vllm_config.compute_hash() + compiler_hash = self.compiler_manager.compute_hash(vllm_config) + forward_code_files = list(sorted(self.compilation_config.traced_files)) + + logger.debug( + "Traced files (to be considered for compilation cache):\n%s", + lazy(lambda: "\n".join(forward_code_files)), + ) + hash_content = [] + for filepath in forward_code_files: + hash_content.append(filepath) + if filepath == "": + # This means the function was dynamically generated, with + # e.g. exec(). We can't actually check these. + continue + try: + with open(filepath) as f: + hash_content.append(f.read()) + except Exception: + logger.warning("Failed to read file %s", filepath) + continue + code_hash = hashlib.sha256("\n".join(hash_content).encode()).hexdigest() + # Clear after consumption + self.compilation_config.traced_files.clear() if not self.compilation_config.cache_dir: # no provided cache dir, generate one based on the known factors # that affects the compilation. if none of the factors change, # the cache dir will be the same so that we can reuse the compiled # graph. - - factors = compilation_config_hash_factors(vllm_config) - # 2. factors come from the code files that are traced by Dynamo ( - # it mainly summarizes how the model is used in forward pass) - code_hash = _compute_code_hash(self.compilation_config.traced_files) - self.compilation_config.traced_files.clear() - factors.append(code_hash) - - # 3. compiler hash - compiler_hash = self.compiler_manager.compute_hash(vllm_config) - factors.append(compiler_hash) - - # combine all factors to generate the cache dir - hash_key = hashlib.md5( - str(factors).encode(), usedforsecurity=False - ).hexdigest()[:10] - + factors = [env_hash, config_hash, code_hash, compiler_hash] + # Use SHA-256 for cache key hashing to be consistent across + # compute_hash functions. Truncate for a short cache dir name. 
+ hash_key = hashlib.sha256(str(factors).encode()).hexdigest()[:10] cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, - "torch_compile_cache", - hash_key, + envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key ) self.compilation_config.cache_dir = cache_dir @@ -621,6 +637,7 @@ def __call__( os.makedirs(local_cache_dir, exist_ok=True) self.compilation_config.local_cache_dir = local_cache_dir + # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. disable_cache = not is_compile_cache_enabled( self.compilation_config.inductor_compile_config ) @@ -638,6 +655,50 @@ def __call__( local_cache_dir, disable_cache, self.prefix ) + # Reuses existing cache key + + logger.debug( + "torch.compile cache factors: env=%s cfg=%s comp=%s code=%s dir=%s", + env_hash, + config_hash, + compiler_hash, + code_hash, + local_cache_dir, + ) + + # Persist and log only hash-relevant factors together. + try: + logger.debug( + "Compile env factors (raw):\n%s\nVllm config hash: %s", + lazy(partial(pprint.pformat, env_factors, width=120)), + config_hash, + ) + meta_path = os.path.join(local_cache_dir, "cache_key_factors.json") + if not os.path.exists(meta_path): + with open(meta_path, "w") as f: + json.dump( + { + "env": env_factors, # raw factors used for env_hash + "config_hash": config_hash, + "code_hash": code_hash, + "compiler_hash": compiler_hash, + }, + f, + indent=2, + sort_keys=True, + ) + except Exception: + # Best-effort only; metadata write failures are non-fatal. + logger.warning( + ( + "Could not write compile cache metadata at %s; continuing without " + "metadata. Compiled cache remains valid; diagnostics may be " + "limited." + ), + local_cache_dir, + exc_info=True, + ) + # when dynamo calls the backend, it means the bytecode # transform and analysis are done compilation_counter.num_graphs_seen += 1 diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 0e8bb2fc9735..fe2547d7feca 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -127,7 +127,7 @@ def uuid(self): affects compilation caching. Its uuid depends on the UUIDs of all dependent passes and the pass config. See InductorPass for more info. """ - state = {"pass_config": self.pass_config.uuid(), "passes": []} + state = {"pass_config": self.pass_config.compute_hash(), "passes": []} for pass_ in self.passes: state["passes"].append(pass_.uuid()) state["passes"].append(self.fix_functionalization.uuid()) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 864cf1be81b2..2652c7c06ad0 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from dataclasses import field from typing import TYPE_CHECKING, Any, Literal @@ -160,13 +159,29 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.cache_dtype) - factors.append(self.mamba_cache_dtype) - factors.append(self.mamba_ssm_cache_dtype) - # `cpu_offload_gb` does not use `torch.compile` yet. 
- hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() - return hash_str + ignored_factors = { + # Runtime/derived knobs that don't affect compiled graph shape + "gpu_memory_utilization", + "swap_space", + "is_attention_free", + "num_gpu_blocks_override", + "enable_prefix_caching", + "prefix_caching_hash_algo", + # `cpu_offload_gb` does not use `torch.compile` yet. + "cpu_offload_gb", + "cpu_kvcache_space_bytes", + "mamba_page_size_padded", + # Post-init/derived counters + "num_gpu_blocks", + "num_cpu_blocks", + # WIP feature toggle not impacting compiled graph shape + "kv_sharing_fast_prefill", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) def metrics_info(self): # convert cache_config to dict(key: str, value: str) for prometheus diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 088d0b1af757..ca01cb3fb55d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -import hashlib from collections import Counter from collections.abc import Callable from dataclasses import asdict, field @@ -160,7 +159,7 @@ def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]: current_platform.get_device_capability().to_int(), {} ) - def uuid(self): + def compute_hash(self) -> str: """ Produces a hash unique to the pass configuration. Any new fields that affect compilation should be added to the hash. @@ -506,28 +505,33 @@ class CompilationConfig: def compute_hash(self) -> str: """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.mode) - factors.append(self.backend) - factors.append(self.custom_ops) - factors.append(self.splitting_ops) - factors.append(self.use_inductor) - factors.append(self.use_inductor_graph_partition) - factors.append(self.inductor_compile_config) - factors.append(self.inductor_passes) - factors.append(self.pass_config.uuid()) - factors.append(self.compile_cache_save_format) - return hashlib.sha256(str(factors).encode()).hexdigest() + # Opt-out: default-include declared fields; keep a tiny exclude set; + # normalize types; keep SHA-256. For nested opaque configs, include a + # stable identifier (e.g., pass_config.compute_hash()) instead of object id. 
+ + ignored_factors = { + # Paths/dirs and runtime/metrics that don’t affect compiled graph + "debug_dump_path", + "cache_dir", + "local_cache_dir", + "bs_to_padded_graph_size", + "traced_files", + "compilation_time", + "static_forward_context", + "pass_config", # handled separately below + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + factors["pass_config"] = self.pass_config.compute_hash() + return hash_factors(factors) def __repr__(self) -> str: exclude = { diff --git a/vllm/config/model.py b/vllm/config/model.py index f61dbb6a695a..b563a40eb8fc 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib -import json import warnings from collections.abc import Callable from dataclasses import InitVar, field @@ -18,7 +16,7 @@ from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType -from vllm.config.utils import assert_hashable, config, getattr_iter +from vllm.config.utils import config, getattr_iter from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.transformers_utils.config import ( @@ -324,50 +322,50 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.model) - factors.append(self.dtype) - factors.append(self.quantization) - factors.append(self.revision) - factors.append(self.code_revision) - factors.append(self.max_model_len) - factors.append(self.max_logprobs) - factors.append(self.disable_sliding_window) - factors.append(self.trust_remote_code) - factors.append(self.generation_config) - factors.append(self.model_impl) - factors.append(self.override_generation_config) - factors.append(self.video_pruning_rate) - factors.append(self.enable_prompt_embeds) - - # hf_config can control how the model looks! 
- try: - hf_config_json = self.hf_config.to_json_string(use_diff=False) - except TypeError: - from transformers import PretrainedConfig - - from vllm.utils.jsontree import json_map_leaves - - # Handle nested HF configs with unserializable values gracefully - hf_config_json = ( - json.dumps( - json_map_leaves( - lambda v: v.to_dict() - if isinstance(v, PretrainedConfig) - else str(v), - self.hf_config.to_dict(), - ), - indent=2, - sort_keys=True, - ) - + "\n" - ) - - factors.append(hf_config_json) - - str_factors = str(factors) - assert_hashable(str_factors) - return hashlib.sha256(str(factors).encode()).hexdigest() + ignored_factors = { + "runner", + "convert", + "task", + "tokenizer", + "tokenizer_mode", + "seed", + "hf_config_path", + "allowed_local_media_path", + "allowed_media_domains", + "tokenizer_revision", + "spec_target_max_model_len", + "enforce_eager", + "logprobs_mode", + "disable_cascade_attn", + "skip_tokenizer_init", + "enable_prompt_embeds", + "served_model_name", + "config_format", + "hf_token", + "hf_overrides", + "logits_processor_pattern", + "enable_sleep_mode", + "override_attention_dtype", + "logits_processors", + "io_processor_plugin", + "pooler_config", + "override_pooler_config", + "multimodal_config", + "limit_mm_per_prompt", + "media_io_kwargs", + "mm_processor_kwargs", + "mm_processor_cache_gb", + "mm_processor_cache_type", + "mm_shm_cache_max_object_size_mb", + "mm_encoder_tp_mode", + "interleave_mm_strings", + "skip_mm_profiling", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) def _update_nested( self, diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 9a6326d62e82..0f107a7a3ef8 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib import os from typing import TYPE_CHECKING, Any, Literal @@ -448,19 +447,41 @@ def compute_hash(self): This hash is also used for DP worker configuration validation to prevent hangs from mismatched collective communication patterns. 
""" - factors: list[Any] = [] - factors.append(self.pipeline_parallel_size) - factors.append(self.tensor_parallel_size) - factors.append(self.enable_expert_parallel) - factors.append(self.data_parallel_size) - factors.append(self.all2all_backend) - factors.append(self.enable_eplb) - if self.enable_eplb: - factors.append(self.eplb_config.log_balancedness) - factors.append(self.eplb_config.window_size) - factors.append(self.eplb_config.step_interval) - factors.append(self.eplb_config.num_redundant_experts) - return hashlib.sha256(str(factors).encode()).hexdigest() + ignored_factors = { + # Derived/runtime topology, networking, or launch details + "data_parallel_rank", + "data_parallel_rank_local", + "data_parallel_backend", + "data_parallel_external_lb", + "data_parallel_hybrid_lb", + "data_parallel_master_ip", + "data_parallel_master_port", + "_data_parallel_master_port_list", + "data_parallel_rpc_port", + "rank", + "master_addr", + "master_port", + "node_rank", + "nnodes", + "max_parallel_loading_workers", + "disable_custom_all_reduce", + "ray_workers_use_nsight", + "ray_runtime_env", + "placement_group", + "distributed_executor_backend", + "worker_cls", + "sd_worker_cls", + "worker_extension_cls", + "_api_process_count", + "_api_process_rank", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + # Explicitly include backend affecting env factor as before + factors["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND) + return hash_factors(factors) def __post_init__(self) -> None: # Set all2all_backend from env var if not specified, with deprecation warning diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 7e0878d96bbd..02f2b75f608f 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -3,14 +3,19 @@ """Utility functions for vLLM config dataclasses.""" import ast +import enum +import hashlib import inspect +import json +import pathlib import textwrap -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence, Set from dataclasses import MISSING, Field, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar import regex as re +import torch from pydantic.fields import FieldInfo from typing_extensions import runtime_checkable @@ -176,3 +181,115 @@ def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT: ) processed_overrides[field_name] = value return replace(config, **processed_overrides) + + +def normalize_value(x): + """Return a stable, JSON-serializable canonical form for hashing. + Order: primitives, special types (Enum, callable, torch.dtype, Path), then + generic containers (Mapping/Set/Sequence) with recursion. + """ + # Fast path + if x is None or isinstance(x, (bool, int, float, str)): + return x + + # Enums: tag with FQN to avoid primitive collisions. + # Ex: Enum(1) vs int(1) -> ("module.QualName", value). + if isinstance(x, enum.Enum): + enum_type = f"{x.__class__.__module__}.{x.__class__.__qualname__}" + return (enum_type, normalize_value(x.value)) + + # Classes (types) are accepted and canonicalized by their fully-qualified + # name (module.qualname) for a stable identifier. + # Instances are only accepted if they expose uuid(); otherwise they are + # rejected to avoid under-hashing object state. + + # Callables: accept classes only; reject funcs/lambdas/methods. + # Used by LogitsProcessor types and ModelConfig.hf_overrides. 
+ if isinstance(x, type): + module = getattr(x, "__module__", "") + qual = getattr(x, "__qualname__", getattr(x, "__name__", "")) + return ".".join([p for p in (module, qual) if p]) or repr(x) + + # Prefer stable uuid identifiers for objects that provide them, even if + # they are callable instances (e.g., InductorPass wrappers). + if hasattr(x, "uuid") and callable(getattr(x, "uuid", None)): + return x.uuid() + + if callable(x): + raise TypeError("normalize_value: function or callable instance unsupported") + + # Torch dtype: stringify (torch.float64 -> "torch.float64"). + # We rely on the string form here; dtype-bearing fields that need additional + # disambiguation should encode that at the config layer. + if isinstance(x, torch.dtype): + return str(x) + + # Bytes + if isinstance(x, (bytes, bytearray)): + return x.hex() + + # Paths (canonicalize) + if isinstance(x, pathlib.Path): + try: + return str(x.expanduser().resolve()) + except Exception: + return str(x) + + # Dataclasses: represent as (FQN, sorted(field,value) tuple) for stability. + if is_dataclass(x): + type_fqn = f"{x.__class__.__module__}.{x.__class__.__qualname__}" + items = tuple( + (f.name, normalize_value(getattr(x, f.name))) + for f in sorted(fields(x), key=lambda f: f.name) + ) + return (type_fqn, items) + + # Containers (generic) + if isinstance(x, Mapping): + return tuple(sorted((str(k), normalize_value(v)) for k, v in x.items())) + if isinstance(x, Set): + return tuple(sorted(repr(normalize_value(v)) for v in x)) + if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)): + return tuple(normalize_value(v) for v in x) + + # PretrainedConfig + if hasattr(x, "to_json_string") and callable(x.to_json_string): + return x.to_json_string() + + # Unsupported type: e.g., modules, generators, open files, or objects + # without a stable JSON/UUID representation. Hard-error to avoid + # under-hashing. + # If you hit this, either reshape your config to use supported primitives + # and containers, or extend normalize_value to provide a stable encoding + # (e.g., via uuid() or to_json_string()) for this type. + raise TypeError( + f"normalize_value: unsupported type '{type(x).__name__}'. " + "Ensure config values use supported primitives/containers or add a " + "stable representation for this type." + ) + + +def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]: + """Gets the factors used for hashing a config class. + - Includes all dataclass fields not in `ignored_factors`. + - Errors on non-normalizable values. 
+ """ + factors: dict[str, object] = {} + for dc_field in fields(config): + factor = dc_field.name + if factor in ignored_factors: + continue + value = getattr(config, factor, None) + try: + factors[factor] = normalize_value(value) + except TypeError as e: + raise TypeError( + f"get_hash_factors: unsupported type for key '{factor}' " + f"({type(value).__name__})" + ) from e + return factors + + +def hash_factors(items: dict[str, object]) -> str: + """Return a SHA-256 hex digest of the canonical items structure.""" + return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest() diff --git a/vllm/envs.py b/vllm/envs.py index e61fb114325c..212d68114e46 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -import hashlib import json +import logging import os import sys import tempfile @@ -426,6 +426,8 @@ def get_vllm_port() -> int | None: # --8<-- [start:env-vars-definition] +logger = logging.getLogger(__name__) + environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), @@ -1540,85 +1542,88 @@ def is_set(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -def compute_hash() -> str: - """ - WARNING: Whenever a new key is added to this environment - variables, ensure that it is included in the factors list if - it affects the computation graph. For example, different values - of VLLM_PP_LAYER_PARTITION will generate different computation - graphs, so it is included in the factors list. The env vars that - affect the choice of different kernels or attention backends should - also be included in the factors list. - """ - - # The values of envs may affects the computation graph. - # TODO(DefTruth): hash all environment variables? 
- # for key in environment_variables: - # factorize(key) - environment_variables_to_hash = [ - "VLLM_PP_LAYER_PARTITION", - "VLLM_MLA_DISABLE", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", - "VLLM_USE_TRITON_AWQ", - "VLLM_DP_RANK", - "VLLM_DP_SIZE", - "VLLM_USE_STANDALONE_COMPILE", - "VLLM_FUSED_MOE_CHUNK_SIZE", - "VLLM_FLASHINFER_MOE_BACKEND", - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", - "VLLM_ATTENTION_BACKEND", - "VLLM_USE_FLASHINFER_SAMPLER", - "VLLM_DISABLED_KERNELS", - "VLLM_USE_DEEP_GEMM", - "VLLM_MOE_USE_DEEP_GEMM", - "VLLM_USE_DEEP_GEMM_E8M0", - "VLLM_USE_FUSED_MOE_GROUPED_TOPK", - "VLLM_USE_FLASHINFER_MOE_FP16", - "VLLM_USE_FLASHINFER_MOE_FP8", - "VLLM_USE_FLASHINFER_MOE_FP4", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", - "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", - "VLLM_USE_CUDNN_PREFILL", - "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", - "VLLM_USE_TRTLLM_ATTENTION", - "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", - "VLLM_ROCM_USE_AITER", - "VLLM_ROCM_USE_AITER_PAGED_ATTN", - "VLLM_ROCM_USE_AITER_LINEAR", - "VLLM_ROCM_USE_AITER_MOE", - "VLLM_ROCM_USE_AITER_RMSNORM", - "VLLM_ROCM_USE_AITER_MLA", - "VLLM_ROCM_USE_AITER_MHA", - "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", - "VLLM_ROCM_USE_AITER_TRITON_ROPE", - "VLLM_ROCM_USE_AITER_FP8BMM", - "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", - "VLLM_ROCM_USE_AITER_TRITON_GEMM", - "VLLM_ROCM_USE_SKINNY_GEMM", - "VLLM_ROCM_FP8_PADDING", - "VLLM_ROCM_MOE_PADDING", - "VLLM_ROCM_CUSTOM_PAGED_ATTN", - "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", - "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", - "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", - "VLLM_ROCM_FP8_MFMA_PAGE_ATTN", - "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", - "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", - "VLLM_NVFP4_GEMM_BACKEND", - "VLLM_USE_FBGEMM", - "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", - "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", - ] - for key in environment_variables_to_hash: - # if this goes out of sync with environment_variables, - # it's not a user error, it's a bug - assert key in environment_variables, ( - "Please update environment_variables_to_hash in envs.py" - ) +def compile_factors() -> dict[str, object]: + """Return env vars used for torch.compile cache keys. + + Start with every known vLLM env var; drop entries in `ignored_factors`; + hash everything else. 
This keeps the cache key aligned across workers.""" + + ignored_factors: set[str] = { + "MAX_JOBS", + "VLLM_RPC_BASE_PATH", + "VLLM_USE_MODELSCOPE", + "VLLM_RINGBUFFER_WARNING_INTERVAL", + "VLLM_DEBUG_DUMP_PATH", + "VLLM_PORT", + "VLLM_CACHE_ROOT", + "LD_LIBRARY_PATH", + "VLLM_SERVER_DEV_MODE", + "VLLM_DP_MASTER_IP", + "VLLM_DP_MASTER_PORT", + "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", + "VLLM_CI_USE_S3", + "VLLM_MODEL_REDIRECT_PATH", + "VLLM_HOST_IP", + "S3_ACCESS_KEY_ID", + "S3_SECRET_ACCESS_KEY", + "S3_ENDPOINT_URL", + "VLLM_USAGE_STATS_SERVER", + "VLLM_NO_USAGE_STATS", + "VLLM_DO_NOT_TRACK", + "VLLM_LOGGING_LEVEL", + "VLLM_LOGGING_PREFIX", + "VLLM_LOGGING_STREAM", + "VLLM_LOGGING_CONFIG_PATH", + "VLLM_LOG_STATS_INTERVAL", + "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", + "VLLM_TUNED_CONFIG_FOLDER", + "VLLM_ENGINE_ITERATION_TIMEOUT_S", + "VLLM_HTTP_TIMEOUT_KEEP_ALIVE", + "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", + "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", + "VLLM_SLEEP_WHEN_IDLE", + "VLLM_IMAGE_FETCH_TIMEOUT", + "VLLM_VIDEO_FETCH_TIMEOUT", + "VLLM_AUDIO_FETCH_TIMEOUT", + "VLLM_MEDIA_URL_ALLOW_REDIRECTS", + "VLLM_MEDIA_LOADING_THREAD_COUNT", + "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", + "VLLM_VIDEO_LOADER_BACKEND", + "VLLM_MEDIA_CONNECTOR", + "VLLM_ASSETS_CACHE", + "VLLM_ASSETS_CACHE_MODEL_CLEAN", + "VLLM_MM_INPUT_CACHE_GIB", + "VLLM_WORKER_MULTIPROC_METHOD", + "VLLM_ENABLE_V1_MULTIPROCESSING", + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", + "VLLM_CPU_KVCACHE_SPACE", + "VLLM_CPU_OMP_THREADS_BIND", + "VLLM_CPU_NUM_OF_RESERVED_CPU", + "VLLM_CPU_MOE_PREPACK", + "VLLM_CPU_SGL_KERNEL", + "VLLM_TEST_FORCE_LOAD_FORMAT", + "LOCAL_RANK", + "CUDA_VISIBLE_DEVICES", + } + + from vllm.config.utils import normalize_value + + factors: dict[str, object] = {} + for factor, getter in environment_variables.items(): + if factor in ignored_factors: + continue + + try: + raw = getter() + except Exception as exc: # pragma: no cover - defensive logging + logger.warning( + "Skipping environment variable %s while hashing compile factors: %s", + factor, + exc, + ) + continue - factors = [environment_variables[key]() for key in environment_variables_to_hash] + factors[factor] = normalize_value(raw) ray_noset_env_vars = [ # Refer to @@ -1641,8 +1646,8 @@ def compute_hash() -> str: "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR", "RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES", ] - factors.extend([os.getenv(var) for var in ray_noset_env_vars]) - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + for var in ray_noset_env_vars: + factors[var] = normalize_value(os.getenv(var)) - return hash_str + return factors diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 7202259ca21a..44b40ead973b 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.lazy import lazy from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "lazy", "logtime", ] diff --git a/vllm/logging_utils/lazy.py b/vllm/logging_utils/lazy.py new file mode 100644 index 000000000000..3ade79896285 --- /dev/null +++ b/vllm/logging_utils/lazy.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any + + +class lazy: + """Wrap a zero-argument callable evaluated only during log formatting.""" + + 
__slots__ = ("_factory",) + + def __init__(self, factory: Callable[[], Any]) -> None: + self._factory = factory + + def __str__(self) -> str: + return str(self._factory()) + + def __repr__(self) -> str: + return str(self) From 48fc8b1e595766af9c91edfc1de43f3a352575eb Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 19 Nov 2025 10:04:07 -0500 Subject: [PATCH 192/578] [BugFix] Fix async-scheduling + FlashAttn MLA (#28990) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/mla/common.py | 15 +++++++++------ vllm/v1/attention/backends/mla/flashattn_mla.py | 2 +- vllm/v1/attention/backends/utils.py | 1 + vllm/v1/worker/gpu_model_runner.py | 10 +++++++--- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 2ccdd1f143ce..e328049b53c7 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -755,6 +755,7 @@ def build( seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens + dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] @@ -944,18 +945,20 @@ def build( decode_metadata = None if num_decodes > 0: + dcp_tot_seq_lens_device = None + if self.dcp_world_size > 1: + dcp_tot_seq_lens_device = seq_lens[:num_decodes] + seq_lens_cpu = dcp_local_seq_lens_cpu + seq_lens = dcp_local_seq_lens + decode_metadata = self._build_decode( block_table_tensor=block_table_tensor[:num_decodes, ...], seq_lens_cpu=seq_lens_cpu[:num_decodes], - seq_lens_device=dcp_local_seq_lens[:num_decodes] - if self.dcp_world_size > 1 and dcp_local_seq_lens is not None - else seq_lens[:num_decodes], + seq_lens_device=seq_lens[:num_decodes], query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1], query_start_loc_device=query_start_loc[: num_decodes + 1], num_decode_tokens=num_decode_tokens, - dcp_tot_seq_lens_device=seq_lens[:num_decodes] - if self.dcp_world_size > 1 - else None, + dcp_tot_seq_lens_device=dcp_tot_seq_lens_device, ) attn_metadata = self.metadata_cls( diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 7794e89cc0a9..12639edc8b9a 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -173,7 +173,7 @@ def _build_decode( ) -> FlashAttnMLADecodeMetadata: query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] max_query_len = query_lens_cpu.max().item() - max_seq_len = seq_lens_device.max().item() + max_seq_len = seq_lens_cpu.max().item() # For Flash Attention MLA + full cudagraph max_num_splits = 0 diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 578153cda786..0dd189633129 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -92,6 +92,7 @@ class CommonAttentionMetadata: encoder_seq_lens: np.ndarray | None = None dcp_local_seq_lens: torch.Tensor | None = None + dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 506118d2d762..3b00085b6bb9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1451,9 +1451,12 @@ def _build_attention_metadata( 
num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ :num_reqs ] - dcp_local_seq_lens = ( - self.dcp_local_seq_lens.gpu[:num_reqs] if self.dcp_world_size > 1 else None - ) + + dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None + if self.dcp_world_size > 1: + dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs] + dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs] + spec_decode_common_attn_metadata = None if for_cudagraph_capture: @@ -1521,6 +1524,7 @@ causal=True, encoder_seq_lens=encoder_seq_lens, dcp_local_seq_lens=dcp_local_seq_lens, + dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu, ) if self.speculative_config and spec_decode_common_attn_metadata is None: From d44e9df7d49a9bb3400b002c38c06fae2dd7d1e8 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 20 Nov 2025 00:24:55 +0800 Subject: [PATCH 193/578] [Model][Mamba] Add selector for mamba attention backend and make it pluggable for other device (#26487) Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/contributing/model/basic.md | 1 + vllm/attention/__init__.py | 3 +- vllm/attention/backends/registry.py | 114 +++++++++++++++--- vllm/attention/selector.py | 33 ++++- vllm/model_executor/layers/kda.py | 8 +- vllm/model_executor/layers/mamba/abstract.py | 10 +- .../layers/mamba/linear_attn.py | 14 --- .../layers/mamba/mamba_mixer.py | 10 +- .../layers/mamba/mamba_mixer2.py | 9 -- .../model_executor/layers/mamba/short_conv.py | 9 -- vllm/model_executor/models/plamo2.py | 9 -- vllm/model_executor/models/qwen3_next.py | 9 +- 12 files changed, 144 insertions(+), 85 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index a7b54f015c2d..d7f5d2f311a3 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -146,6 +146,7 @@ We use "mamba-like" to refer to layers that possess a state that is updated in-pl For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this. +It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend. Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
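The documentation paragraph above, together with the `registry.py` changes later in this patch, suggests what registering a new mamba backend looks like. A hedged sketch follows, mirroring the docstring examples in the registry diff below; the backend class here is hypothetical and assumed to implement vLLM's `AttentionBackend` interface.

```python
# Hedged sketch: registering a third-party mamba attention backend under the
# CUSTOM slot, mirroring the docstring examples in the registry.py diff below.
# MyMambaAttentionBackend is hypothetical and is assumed to implement vLLM's
# AttentionBackend interface.
from vllm.attention.backends.registry import (
    MambaAttentionBackendEnum,
    register_backend,
)


@register_backend(MambaAttentionBackendEnum.CUSTOM, is_mamba=True)
class MyMambaAttentionBackend:
    ...


# The intent is that the CUSTOM member now resolves to the registered class:
# MambaAttentionBackendEnum.CUSTOM.get_class() -> MyMambaAttentionBackend
```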
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index dd35165d5415..8b4dc4013362 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -7,7 +7,7 @@ AttentionType, ) from vllm.attention.layer import Attention -from vllm.attention.selector import get_attn_backend +from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend __all__ = [ "Attention", @@ -15,4 +15,5 @@ "AttentionMetadata", "AttentionType", "get_attn_backend", + "get_mamba_attn_backend", ] diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index f07a6059be37..51899b023591 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend registry""" -import enum from collections.abc import Callable +from enum import Enum, EnumMeta from typing import TYPE_CHECKING, cast from vllm.logger import init_logger @@ -15,7 +15,7 @@ logger = init_logger(__name__) -class _AttentionBackendEnumMeta(enum.EnumMeta): +class _AttentionBackendEnumMeta(EnumMeta): """Metaclass for AttentionBackendEnum to provide better error messages.""" def __getitem__(cls, name: str): @@ -23,15 +23,15 @@ def __getitem__(cls, name: str): try: return super().__getitem__(name) except KeyError: - members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values() - valid_backends = ", ".join(m.name for m in members) + members = cast("dict[str, Enum]", cls.__members__).keys() + valid_backends = ", ".join(members) raise ValueError( f"Unknown attention backend: '{name}'. " f"Valid options are: {valid_backends}" ) from None -class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta): +class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): """Enumeration of all supported attention backends. The enum value is the default class path, but this can be overridden @@ -83,7 +83,7 @@ def get_path(self, include_classname: bool = True) -> str: Raises: ValueError: If Backend.CUSTOM is used without being registered """ - path = _OVERRIDES.get(self, self.value) + path = _ATTN_OVERRIDES.get(self, self.value) if not path: raise ValueError( f"Backend {self.name} must be registered before use. " @@ -111,18 +111,93 @@ def is_overridden(self) -> bool: Returns: True if the backend has a registered override """ - return self in _OVERRIDES + return self in _ATTN_OVERRIDES def clear_override(self) -> None: """Clear any override for this backend, reverting to the default.""" - _OVERRIDES.pop(self, None) + _ATTN_OVERRIDES.pop(self, None) -_OVERRIDES: dict[AttentionBackendEnum, str] = {} +class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): + """Enumeration of all supported mamba attention backends. + + The enum value is the default class path, but this can be overridden + at runtime using register_backend(). 
+ + To get the actual backend class (respecting overrides), use: + backend.get_class() + """ + + MAMBA1 = "vllm.v1.attention.backends.mamba1_attn.Mamba1AttentionBackend" + MAMBA2 = "vllm.v1.attention.backends.mamba2_attn.Mamba2AttentionBackend" + SHORT_CONV = "vllm.v1.attention.backends.short_conv_attn.ShortConvAttentionBackend" + LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend" + GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend" + # Placeholder for third-party/custom backends - must be registered before use + CUSTOM = "" + + def get_path(self, include_classname: bool = True) -> str: + """Get the class path for this backend (respects overrides). + + Returns: + The fully qualified class path string + + Raises: + ValueError: If Backend.CUSTOM is used without being registered + """ + path = _MAMBA_ATTN_OVERRIDES.get(self, self.value) + if not path: + raise ValueError( + f"Backend {self.name} must be registered before use. " + f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')" + ) + if not include_classname: + path = path.rsplit(".", 1)[0] + return path + + def get_class(self) -> "type[AttentionBackend]": + """Get the backend class (respects overrides). + + Returns: + The backend class + + Raises: + ImportError: If the backend class cannot be imported + ValueError: If Backend.CUSTOM is used without being registered + """ + return resolve_obj_by_qualname(self.get_path()) + + def is_overridden(self) -> bool: + """Check if this backend has been overridden. + + Returns: + True if the backend has a registered override + """ + return self in _MAMBA_ATTN_OVERRIDES + + def clear_override(self) -> None: + """Clear any override for this backend, reverting to the default.""" + _MAMBA_ATTN_OVERRIDES.pop(self, None) + + +MAMBA_TYPE_TO_BACKEND_MAP = { + "mamba1": MambaAttentionBackendEnum.MAMBA1.name, + "mamba2": MambaAttentionBackendEnum.MAMBA2.name, + "short_conv": MambaAttentionBackendEnum.SHORT_CONV.name, + "linear_attention": MambaAttentionBackendEnum.LINEAR.name, + "gdn_attention": MambaAttentionBackendEnum.GDN_ATTN.name, + "custom": MambaAttentionBackendEnum.CUSTOM.name, +} + + +_ATTN_OVERRIDES: dict[AttentionBackendEnum, str] = {} +_MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {} def register_backend( - backend: AttentionBackendEnum, class_path: str | None = None + backend: AttentionBackendEnum | MambaAttentionBackendEnum, + is_mamba: bool = False, + class_path: str | None = None, ) -> Callable[[type], type]: """Register or override a backend implementation. @@ -135,12 +210,17 @@ def register_backend( Decorator function if class_path is None, otherwise a no-op Examples: - # Override an existing backend + # Override an existing attention backend @register_backend(AttentionBackendEnum.FLASH_ATTN) class MyCustomFlashAttn: ... - # Register a custom third-party backend + # Override an existing mamba attention backend + @register_backend(MambaAttentionBackendEnum.LINEAR, is_mamba=True) + class MyCustomMambaAttn: + ... + + # Register a custom third-party attention backend @register_backend(AttentionBackendEnum.CUSTOM) class MyCustomBackend: ... 
@@ -153,11 +233,17 @@ class MyCustomBackend: """ def decorator(cls: type) -> type: - _OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" + if is_mamba: + _MAMBA_ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index] + else: + _ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index] return cls if class_path is not None: - _OVERRIDES[backend] = class_path + if is_mamba: + _MAMBA_ATTN_OVERRIDES[backend] = class_path # type: ignore[index] + else: + _ATTN_OVERRIDES[backend] = class_path # type: ignore[index] return lambda x: x return decorator diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 1a092db9ce37..e9af08b2316d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -12,7 +12,11 @@ import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.backends.registry import ( + MAMBA_TYPE_TO_BACKEND_MAP, + AttentionBackendEnum, + MambaAttentionBackendEnum, +) from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.utils import STR_BACKEND_ENV_VAR @@ -197,6 +201,33 @@ def _cached_get_attn_backend( return backend +def get_mamba_attn_backend( + mamba_type: str, +) -> type[AttentionBackend]: + """Select which mamba attention backend to use and lazily import it.""" + return _cached_get_mamba_attn_backend(mamba_type) + + +@cache +def _cached_get_mamba_attn_backend( + mamba_type: str, +) -> type[AttentionBackend]: + assert mamba_type and isinstance(mamba_type, str) + + selected_backend = None + try: + backend_name = MAMBA_TYPE_TO_BACKEND_MAP[mamba_type] + selected_backend = MambaAttentionBackendEnum[backend_name] + except KeyError as e: + raise ValueError( + f"Invalid mamba attention backend type: '{backend_name}'. 
Valid " + f"backends are: {list(MambaAttentionBackendEnum.__members__.keys())}" + ) from e + + mamba_attn_backend = selected_backend.get_class() + return mamba_attn_backend + + @contextmanager def global_force_attn_backend_context_manager( attn_backend: AttentionBackendEnum, diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index 2e7500bac718..27cc3884517f 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -5,7 +5,6 @@ from einops import rearrange from torch import nn -from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed import ( @@ -83,12 +82,7 @@ def kda_attention_fake( class KimiDeltaAttention(nn.Module, MambaBase): @property def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend - - return GDNAttentionBackend + return "gdn_attention" def get_state_dtype( self, diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index e68b09b4d81f..aa919d6fdc35 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -6,6 +6,7 @@ import torch +from vllm.attention.selector import get_mamba_attn_backend from vllm.config import VllmConfig from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec @@ -38,11 +39,6 @@ def get_state_shape(self) -> Iterable[tuple[int, ...]]: def mamba_type(self) -> str: pass - @abstractmethod - def get_attn_backend(self) -> type["AttentionBackend"]: - """Get the attention backend class for this Mamba layer.""" - pass - @abstractmethod def get_state_dtype(self) -> tuple[torch.dtype, ...]: pass @@ -69,3 +65,7 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None: else 0 ), ) + + def get_attn_backend(self) -> type["AttentionBackend"]: + """Get the attention backend class for this Mamba layer.""" + return get_mamba_attn_backend(self.mamba_type) diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index 0a2742ff49a4..d85b3e61c5d6 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -2,12 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -37,9 +31,6 @@ from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class MiniMaxText01RMSNormTP(CustomOp): name = "MiniMaxText01RMSNormTP" @@ -123,11 +114,6 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): def mamba_type(self) -> str: return "linear_attention" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend - - return LinearAttentionBackend - def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None assert self.cache_config is not None diff --git 
a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index b6345b8af7f0..90e520e24441 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, NamedTuple - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend +from typing import NamedTuple import torch from torch import nn @@ -452,11 +449,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: def mamba_type(self) -> str: return "mamba1" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend - - return Mamba1AttentionBackend - def _time_proj_bias(self) -> torch.Tensor | None: if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None: return self.dt_proj.bias.float() diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 57313990b820..900701c46348 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn @@ -908,11 +904,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: def mamba_type(self) -> str: return "mamba2" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend - - return Mamba2AttentionBackend - def mamba_mixer2( projected_states: torch.Tensor, diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 04efa8a8b373..0bbad17d7ebc 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch @@ -232,11 +228,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...]]: def mamba_type(self) -> str: return "short_conv" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend - - return ShortConvAttentionBackend - def short_conv( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 0c87f5000ff4..52c9755e0e0e 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -4,10 +4,6 @@ from collections.abc import Iterable from itertools import islice -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn @@ -467,11 +463,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: def mamba_type(self) -> str: return "mamba2" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend - - return Mamba2AttentionBackend - def 
plamo2_mamba_mixer( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 0415c8e00fdf..ad631f61e4b9 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,7 @@ from torch import nn from transformers.activations import ACT2FN -from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, @@ -216,12 +216,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): @property def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend - - return GDNAttentionBackend + return "gdn_attention" def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: return MambaStateDtypeCalculator.gated_delta_net_state_dtype( From a8b70304d68497ac1c432a2ff343e9bfb516c227 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:06:36 +0100 Subject: [PATCH 194/578] Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 6 +- benchmarks/kernels/benchmark_mrope.py | 19 ++-- .../offline_inference/context_extension.py | 6 +- tests/compile/test_functionalization.py | 4 +- tests/kernels/core/test_mrope.py | 16 +-- tests/kernels/core/test_pos_encoding.py | 39 +++---- .../moe/test_gpt_oss_triton_kernels.py | 2 +- .../pooling/test_nomic_max_model_len.py | 16 +-- tests/test_config.py | 37 ++++--- vllm/config/model.py | 63 +++++------ .../layers/rotary_embedding/__init__.py | 76 ++++++------- vllm/model_executor/models/afmoe.py | 17 +-- vllm/model_executor/models/apertus.py | 22 +--- vllm/model_executor/models/arcee.py | 11 -- vllm/model_executor/models/arctic.py | 3 +- vllm/model_executor/models/baichuan.py | 8 +- vllm/model_executor/models/bailing_moe.py | 3 +- vllm/model_executor/models/bamba.py | 6 +- vllm/model_executor/models/chameleon.py | 29 +---- vllm/model_executor/models/chatglm.py | 3 +- vllm/model_executor/models/commandr.py | 5 +- vllm/model_executor/models/config.py | 22 ++-- vllm/model_executor/models/dbrx.py | 7 +- vllm/model_executor/models/deepseek_v2.py | 43 +++----- vllm/model_executor/models/dots1.py | 11 +- vllm/model_executor/models/ernie45_moe.py | 14 +-- vllm/model_executor/models/ernie45_vl_moe.py | 13 +-- vllm/model_executor/models/exaone.py | 21 +--- vllm/model_executor/models/exaone4.py | 19 +--- vllm/model_executor/models/falcon.py | 3 +- vllm/model_executor/models/falcon_h1.py | 8 +- vllm/model_executor/models/gemma.py | 8 +- vllm/model_executor/models/gemma2.py | 5 +- vllm/model_executor/models/gemma3.py | 21 ++-- vllm/model_executor/models/gemma3n.py | 20 ++-- vllm/model_executor/models/glm4.py | 10 +- vllm/model_executor/models/glm4_1v.py | 1 - vllm/model_executor/models/glm4_moe.py | 11 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/gpt_neox.py | 3 +- vllm/model_executor/models/gpt_oss.py | 13 ++- vllm/model_executor/models/granite.py | 17 +-- vllm/model_executor/models/granitemoe.py | 13 +-- .../model_executor/models/granitemoehybrid.py | 5 +- 
.../model_executor/models/granitemoeshared.py | 6 +- vllm/model_executor/models/grok1.py | 11 +- vllm/model_executor/models/hunyuan_v1.py | 25 +---- vllm/model_executor/models/internlm2.py | 12 +-- vllm/model_executor/models/internlm2_ve.py | 5 +- vllm/model_executor/models/kimi_linear.py | 5 - vllm/model_executor/models/lfm2.py | 17 +-- vllm/model_executor/models/lfm2_moe.py | 17 +-- vllm/model_executor/models/llama.py | 22 +--- vllm/model_executor/models/llama4.py | 11 +- vllm/model_executor/models/longcat_flash.py | 22 ++-- vllm/model_executor/models/minicpm.py | 12 +-- vllm/model_executor/models/minicpm3.py | 10 +- vllm/model_executor/models/minicpm_eagle.py | 5 +- vllm/model_executor/models/minimax_m2.py | 12 +-- vllm/model_executor/models/minimax_text_01.py | 9 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mllama4.py | 8 +- vllm/model_executor/models/molmo.py | 3 +- vllm/model_executor/models/nemotron.py | 17 +-- vllm/model_executor/models/nemotron_nas.py | 19 +--- vllm/model_executor/models/olmo.py | 3 +- vllm/model_executor/models/olmo2.py | 13 +-- vllm/model_executor/models/olmoe.py | 6 +- vllm/model_executor/models/openpangu.py | 26 ++--- vllm/model_executor/models/orion.py | 12 +-- vllm/model_executor/models/ouro.py | 11 +- vllm/model_executor/models/persimmon.py | 3 +- vllm/model_executor/models/phi.py | 6 +- vllm/model_executor/models/phimoe.py | 18 ++-- vllm/model_executor/models/plamo2.py | 7 +- vllm/model_executor/models/qwen.py | 11 +- vllm/model_executor/models/qwen2.py | 16 +-- vllm/model_executor/models/qwen2_5_vl.py | 1 - vllm/model_executor/models/qwen2_moe.py | 12 +-- vllm/model_executor/models/qwen2_vl.py | 1 - vllm/model_executor/models/qwen3.py | 15 +-- vllm/model_executor/models/qwen3_moe.py | 12 +-- vllm/model_executor/models/qwen3_next.py | 3 +- .../models/qwen3_omni_moe_thinker.py | 1 - vllm/model_executor/models/qwen3_vl.py | 1 - vllm/model_executor/models/seed_oss.py | 15 +-- vllm/model_executor/models/solar.py | 18 +--- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 3 +- vllm/model_executor/models/step3_text.py | 16 ++- .../models/transformers/utils.py | 10 +- vllm/model_executor/models/zamba2.py | 4 +- vllm/transformers_utils/config.py | 100 +++++++++++++----- vllm/transformers_utils/configs/afmoe.py | 7 +- vllm/transformers_utils/configs/arctic.py | 18 +++- vllm/transformers_utils/configs/flex_olmo.py | 17 +-- .../transformers_utils/configs/kimi_linear.py | 12 ++- vllm/transformers_utils/configs/lfm2_moe.py | 12 ++- .../transformers_utils/configs/midashenglm.py | 2 +- vllm/transformers_utils/configs/mistral.py | 4 +- vllm/transformers_utils/configs/nemotron.py | 60 ++++++----- vllm/transformers_utils/configs/olmo3.py | 12 ++- vllm/transformers_utils/configs/qwen3_next.py | 17 +-- vllm/transformers_utils/configs/step3_vl.py | 12 ++- 104 files changed, 544 insertions(+), 912 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e62cd60efaec..d4b6f4077ab3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -872,12 +872,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or 
RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579..83bd91917508 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,7 +6,7 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # speedup # @@ -86,9 +86,8 @@ def benchmark_mrope( num_heads: int, num_kv_heads: int, max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -203,9 +201,8 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -255,9 +252,8 @@ def benchmark_mrope( "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -303,7 +299,7 @@ def benchmark_mrope( q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +311,8 @@ def benchmark_mrope( num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index df39e4c25d5c..67d33e1881ee 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This script demonstrates how to extend the context 
length -of a Qwen model using the YARN method (rope_scaling) +of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. Usage: @@ -19,8 +19,8 @@ def create_llm(): # Use yarn to extend context hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 11ae96e930da..515e0a93ac2a 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -137,7 +137,7 @@ def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, q, k): @@ -172,7 +172,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, hidden_states): diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 02b795721f46..43b242ab2d58 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -5,11 +5,11 @@ import pytest import torch from packaging.version import Version -from transformers import AutoConfig from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -98,8 +98,7 @@ def test_mrope( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -113,7 +112,6 @@ def test_mrope( ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -122,9 +120,8 @@ def test_mrope( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) @@ -173,8 +170,7 @@ def test_mrope_torch_compile_tracing( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -187,7 +183,6 @@ def test_mrope_torch_compile_tracing( else config.hidden_size // total_num_heads ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -196,9 +191,8 @@ def test_mrope_torch_compile_tracing( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + 
rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index c35ee5016ba0..a8ed3825689d 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -74,7 +74,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: float = 10000, + rope_theta: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -83,7 +83,8 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -120,9 +121,9 @@ def test_rotary_embedding( @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] - BASES = [10000, 1000000] - ROPE_SCALINGS = ( - None, + ROPE_THETAS = [10000, 1000000] + ROPE_PARAMETERS = ( + {"rope_type": "default"}, {"rope_type": "linear", "factor": (1,)}, {"rope_type": "dynamic", "factor": 1}, ) @@ -130,9 +131,9 @@ def test_rope_module_cache(): HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, - BASES, + ROPE_THETAS, IS_NEOX_STYLE, - ROPE_SCALINGS, + ROPE_PARAMETERS, DTYPES, ) rope_setting_id_map: dict[str, int] = {} @@ -141,20 +142,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # different settings cannot share the same rope module @@ -168,20 +169,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # check if cache take effect diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index dfd317bcf72f..af33fd4e3fc3 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -201,7 +201,7 @@ class ModelConfig: sliding_window: int = 128 initial_context_length: int = 4096 rope_theta: float = 150000.0 - rope_scaling_factor: float = 32.0 + rope_parameters_factor: float = 32.0 rope_ntk_alpha: float = 1.0 rope_ntk_beta: float = 32.0 diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 88f088c60327..d6216a87a229 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 +from typing import Any + import pytest from ...utils import EmbedModelInfo @@ -79,8 
+81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_illegal(model_info, vllm_runner): - hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + hf_overrides: dict[str, Any] = { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): pass hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db2..16f68d18fc68 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -249,45 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} - TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + TEST_ROPE_PARAMETERS = { + "rope_theta": 16_000_000.0, + "rope_type": "dynamic", + "factor": 2.0, + } + LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"} + LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0} llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct") - assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None - assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 + assert ( + getattr(llama_model_config.hf_config, "rope_parameters", None) + == LLAMA_ROPE_PARAMETERS + ) assert llama_model_config.max_model_len == 8192 llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, - "rope_theta": TEST_ROPE_THETA, - }, + hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS}, ) assert ( - getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING + getattr(llama_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) - assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 longchat_model_config = ModelConfig("lmsys/longchat-13b-16k") - # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + # Check if LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config assert all( - longchat_model_config.hf_config.rope_scaling.get(key) == value - for key, value in LONGCHAT_ROPE_SCALING.items() + longchat_model_config.hf_config.rope_parameters.get(key) == value + for key, value in LONGCHAT_ROPE_PARAMETERS.items() ) assert longchat_model_config.max_model_len == 16384 longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_PARAMETERS, }, ) assert ( - getattr(longchat_model_config.hf_config, "rope_scaling", None) - == TEST_ROPE_SCALING + 
getattr(longchat_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/config/model.py b/vllm/config/model.py index b563a40eb8fc..d1e56a72a318 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -11,6 +11,7 @@ from pydantic import ConfigDict, SkipValidation, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig @@ -2100,31 +2101,32 @@ def _get_and_verify_max_len( ) derived_max_model_len = default_max_len - rope_scaling = getattr(hf_config, "rope_scaling", None) + # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. + # To simplify the verification, we convert it to dict[str, TypedDict]. + rope_parameters = getattr(hf_config, "rope_parameters", None) + if rope_parameters and not set(rope_parameters.keys()).issubset( + ALLOWED_LAYER_TYPES + ): + rope_parameters = {"": rope_parameters} + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. - if rope_scaling is not None and "gemma3" not in hf_config.model_type: - # No need to consider "type" key because of patch_rope_scaling when - # loading HF config - rope_type = rope_scaling["rope_type"] - - if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: - # TODO(robertgshaw): Find a model that supports rope_scaling - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "with rope_scaling. Please raise an issue so we can " - "investigate." - ) - - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rope_scaling.get("factor", 1.0) - - if rope_type == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] - derived_max_model_len *= scaling_factor + if rope_parameters is not None and "gemma3" not in hf_config.model_type: + scaling_factor = 1.0 + for rp in rope_parameters.values(): + # No need to consider "type" key because of patch_rope_parameters when + # loading HF config + rope_type = rp["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + # NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + # NOTE: This assumes all layer types have the same scaling factor. 
+ scaling_factor = rp.get("factor", scaling_factor) + + if rope_type == "yarn": + derived_max_model_len = rp["original_max_position_embeddings"] + # Do this outside loop since all layer types should have the same scaling + derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: derived_max_model_len = encoder_config["max_seq_length"] @@ -2134,7 +2136,9 @@ def _get_and_verify_max_len( if max_model_len is None: # For LongRoPE, default to original_max_position_embeddings to avoid # performance degradation for shorter sequences - if rope_scaling is not None and rope_scaling["rope_type"] == "longrope": + if rope_parameters is not None and any( + rp["rope_type"] == "longrope" for rp in rope_parameters.values() + ): max_model_len = int( getattr( hf_config, "original_max_position_embeddings", derived_max_model_len @@ -2151,16 +2155,7 @@ def _get_and_verify_max_len( # that will be bigger than derived_max_model_len. We compare user input # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window: - # TODO(robertgshaw): Find a model that has model_max_length - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "model_max_length in the config. Please raise an issue " - "so we can investigate." - ) - else: + if model_max_length is None or max_model_len > model_max_length: msg = ( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 56c165f9c041..ae8a7d93b50e 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -26,23 +26,23 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: float, is_neox_style: bool = True, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - if rope_scaling is not None: + if rope_parameters is not None: # Transforms every value that is a list into a tuple for caching calls - rope_scaling_tuple = { - k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items() + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in rope_parameters.items() } - rope_scaling_args = tuple(rope_scaling_tuple.items()) + rope_parameters_args = tuple(rope_parameters_tuple.items()) else: - rope_scaling_args = None + rope_parameters_args = None if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -60,15 +60,15 @@ def get_rope( head_size, rotary_dim, max_position, - base, is_neox_style, - rope_scaling_args, + rope_parameters_args, dual_chunk_attention_args, dtype, ) if key in _ROPE_DICT: return _ROPE_DICT[key] + base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -84,18 +84,18 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_scaling: + elif not rope_parameters: rotary_emb = RotaryEmbedding( head_size, 
rotary_dim, max_position, base, is_neox_style, dtype ) else: - scaling_type = rope_scaling["rope_type"] + scaling_type = rope_parameters["rope_type"] if scaling_type == "llama3": - scaling_factor = rope_scaling["factor"] - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] rotary_emb = Llama3RotaryEmbedding( head_size, rotary_dim, @@ -113,7 +113,7 @@ def get_rope( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) elif scaling_type == "default": - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: rotary_emb = MRotaryEmbedding( head_size, rotary_dim, @@ -121,8 +121,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), ) else: rotary_emb = RotaryEmbedding( @@ -134,7 +134,7 @@ def get_rope( dtype, ) elif scaling_type == "linear": - scaling_factor = rope_scaling["factor"] + scaling_factor = rope_parameters["factor"] rotary_emb = LinearScalingRotaryEmbedding( head_size, rotary_dim, @@ -145,8 +145,8 @@ def get_rope( dtype, ) elif scaling_type == "ntk": - scaling_factor = rope_scaling["factor"] - mixed_b = rope_scaling.get("mixed_b", None) + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b") rotary_emb = NTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -158,8 +158,8 @@ def get_rope( mixed_b, ) elif scaling_type == "dynamic": - if "alpha" in rope_scaling: - scaling_alpha = rope_scaling["alpha"] + if "alpha" in rope_parameters: + scaling_alpha = rope_parameters["alpha"] rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, @@ -169,8 +169,8 @@ def get_rope( scaling_alpha, dtype, ) - elif "factor" in rope_scaling: - scaling_factor = rope_scaling["factor"] + elif "factor" in rope_parameters: + scaling_factor = rope_parameters["factor"] rotary_emb = DynamicNTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -185,11 +185,11 @@ def get_rope( "Dynamic rope scaling must contain either 'alpha' or 'factor' field" ) elif scaling_type == "yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -199,7 +199,7 @@ def get_rope( "apply_yarn_scaling", ) } - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: extra_kwargs.pop("apply_yarn_scaling", None) rotary_emb = MRotaryEmbedding( head_size, @@ -208,8 +208,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), scaling_factor=scaling_factor, **extra_kwargs, ) @@ -225,12 +225,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == 
"deepseek_yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] # assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -252,12 +252,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "longrope": - short_factor = rope_scaling["short_factor"] - long_factor = rope_scaling["long_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ("short_mscale", "long_mscale") } rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 6f654f47495f..4eb5665a71fc 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -5,7 +5,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -202,7 +199,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # Check if this is a local attention layer @@ -246,8 +242,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config["rope_parameters"], is_neox_style=True, ) else: @@ -303,14 +298,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix @@ -323,8 +310,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 0a8f21abb0a3..b75e91319bba 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -118,8 +117,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 
10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -155,7 +152,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,9 +172,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -224,7 +218,6 @@ def forward( def _init_rotary_emb( self, config: ApertusConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -236,8 +229,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -253,14 +245,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -288,8 +272,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 20c3ff075450..b3887b16f4d7 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -103,15 +103,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Rotary embedding parameters (reuse LLaMA defaults) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Determine if attention bias is needed (some variants use bias terms) attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -133,8 +124,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b5cc07a56535..b75a254761d4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -292,7 +292,6 @@ def __init__( self.kv_size = 
self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.scaling = self.head_dim**-0.5 self.qkv_proj = QKVParallelLinear( @@ -317,7 +316,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8991ef4c606b..edf47270e527 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -136,7 +136,7 @@ def __init__( hidden_size: int, num_heads: int, position_embedding: str, - rope_theta: float = 10000, + rope_parameters: dict, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -150,7 +150,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = hidden_size // self.total_num_heads self.position_embedding = position_embedding - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # pylint: disable=invalid-name @@ -192,7 +191,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( @@ -229,13 +228,12 @@ def __init__( ): super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 024425bb2440..cc10e936a2d3 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -135,9 +135,8 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, - rope_scaling=config.rope_scaling, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index c6cc83487fec..4422bb5da98f 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -156,8 +156,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -178,7 +176,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -192,8 +189,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - 
base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 3c87bbfefab3..b5a6d00dc309 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -265,8 +265,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -293,7 +292,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -318,8 +316,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -369,14 +366,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -385,8 +374,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, @@ -439,14 +427,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -455,8 +435,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5d6f5e9125a2..dbfcd62d0bca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,6 +99,7 @@ def __init__( # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope @@ -106,7 +107,7 @@ def __init__( 
self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, - base=10000 * rope_ratio, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 77bb17851981..5ed920927c77 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -156,8 +156,6 @@ def __init__( self.max_position_embeddings = getattr( config, "model_max_length", None ) or getattr(config, "max_position_embeddings", 8192) - self.rope_theta = config.rope_theta - self.rope_scaling = getattr(config, "rope_scaling", None) self.use_qk_norm = getattr(config, "use_qk_norm", False) self.qkv_proj = QKVParallelLinear( self.hidden_size, @@ -179,8 +177,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 66b246878b0a..3cf4bf991e66 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform +from vllm.transformers_utils.config import set_default_rope_theta from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec @@ -46,8 +47,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -78,12 +78,13 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -117,18 +118,20 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) + + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } # we ignore config.rotary_scaling_factor so that for datasets shorter # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. - # The context extension uses vllm style rope_theta and rope_scaling. + # The context extension uses vllm style rope_theta and rope_parameters. 
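Note: across this series the separate `base=` and `rope_scaling=` arguments to `get_rope` are folded into a single `rope_parameters` dict, as the rotary_kwargs change above shows. A minimal sketch of the assumed dict shape and the new-style call, with hypothetical head size and max position (it assumes a vLLM tree with this series applied):

    from vllm.model_executor.layers.rotary_embedding import get_rope

    # Assumed shape of config.rope_parameters after this change: the base
    # frequency and any scaling fields live in one dict keyed by "rope_type".
    rope_parameters = {
        "rope_type": "default",   # e.g. "yarn" or "deepseek_yarn" when scaled
        "rope_theta": 10000.0,    # replaces the former `base=` argument
    }
    rotary_emb = get_rope(
        128,                      # head_size (hypothetical)
        rotary_dim=128,
        max_position=8192,
        rope_parameters=rope_parameters,
        is_neox_style=True,
    )

Helpers such as `set_default_rope_theta(config, default_theta=...)`, imported from `vllm.transformers_utils.config`, appear to backfill `rope_theta` into `config.rope_parameters` for configs that only carry the legacy field; the authoritative behavior lives in that module.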
# See #17785 #18755 if ( not vllm_config.model_config.hf_overrides @@ -172,7 +175,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") hf_text_config.max_position_embeddings = max_trained_positions - hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] # The priority of sentence_bert_config.json is higher # than max_position_embeddings @@ -246,8 +249,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 528ef4f76742..2c729019081a 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -197,7 +197,10 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads self.total_num_kv_heads = config.attn_config.kv_n_heads self.clip_qkv = config.attn_config.clip_qkv - self.rope_theta = config.attn_config.rope_theta + rope_parameters = { + "rope_type": "default", + "rope_theta": int(config.attn_config.rope_theta), + } self.max_position = config.max_seq_len # pylint: disable=invalid-name @@ -221,7 +224,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e8ee9951d611..6675b2133f38 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( config: DeepseekV2Config | DeepseekV3Config, hidden_size: int, num_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -139,7 +136,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -162,8 +158,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -409,8 +404,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -430,7 +423,6 @@ def __init__( assert num_heads % tp_size == 0 self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings assert topk_indices_buffer is None, ( "topk_indices_buffer is not \ @@ -485,21 +477,20 @@ def 
__init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -903,8 +894,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -927,7 +916,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: @@ -981,19 +969,18 @@ def __init__( prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -1073,8 +1060,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) moe_layer_freq = getattr(config, "moe_layer_freq", 1) # DecoderLayers are created with `make_layers` which passes the prefix @@ -1107,8 +1092,6 @@ def __init__( v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index d24da0c42a25..e65c275106a4 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -202,8 +201,6 @@ def __init__( num_heads: int, num_kv_heads: int, config: Dots1Config, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -229,7 +226,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings attention_bias = config.attention_bias @@ -255,8 +251,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -296,8 +291,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = layer_idx @@ -307,8 +300,6 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, config=config, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f2999968669f..a7df3509e3ec 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -62,6 +62,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( @@ -232,9 +233,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, @@ -266,7 +266,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -291,9 +290,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=False, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -333,16 +331,14 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index e8ef86f9b7f0..50e033d77606 100644 --- 
a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -58,6 +58,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .ernie45_moe import Ernie4_5_MoeMLP from .interfaces import SupportsPP @@ -91,9 +92,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, freq_allocation: int = 20, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, @@ -126,7 +126,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -155,7 +154,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position_embeddings=max_position_embeddings, - base=rope_theta, + base=rope_parameters["rope_theta"], is_neox_style=False, dtype=torch.get_default_dtype(), mrope_section=[h_rope, w_rope, t_rope], @@ -413,8 +412,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) @@ -423,8 +421,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, freq_allocation=freq_allocation, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 6c56bfc433c7..d13275488fe9 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -113,8 +112,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -144,7 +141,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -173,8 +169,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -207,8 +202,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -221,8 +214,6 @@ def __init__( hidden_size=hidden_size, num_heads=num_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, 
- rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=bias, @@ -251,14 +242,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -272,8 +255,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index b89e168ada20..70f3cce2b7c5 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -23,7 +23,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -52,6 +51,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -110,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 1000000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -141,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,12 +173,12 @@ def __init__( # apply rotary embeddings to every layer in full attention models self.apply_rope_all_layers = "sliding_attention" not in config.layer_types + set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -227,14 +224,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -249,8 +238,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 
85acdff3d96b..dc2d51f340c8 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -164,13 +164,12 @@ def __init__( ) if self.use_rotary: - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index b985847af5da..9433f0d1b4a4 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -35,6 +35,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import ( HasInnerState, @@ -214,8 +215,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 1e11) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1e11) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -240,7 +240,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -254,8 +253,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=None, # see impl of get_rope ) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 7aaae7c503b5..00c7f59a0809 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from collections.abc import Iterable from functools import cache from itertools import islice +from typing import Any import torch from torch import nn @@ -127,8 +128,8 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -153,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -176,7 +176,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -218,7 +218,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4d5d6cbb37c6..9b6cfe693230 100644 --- 
a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -107,7 +107,6 @@ def __init__( num_kv_heads: int, head_dim: int, max_position_embeddings: int, - rope_theta: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, @@ -134,7 +133,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.query_pre_attn_scalar**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -156,7 +154,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, ) @@ -206,7 +204,6 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, cache_config=cache_config, quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 357e61a4e78b..565719ae7fae 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -155,25 +155,28 @@ def __init__( self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) layer_idx = extract_layer_index(prefix) - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + self.is_sliding = layer_type == "sliding_attention" sliding_window = config.sliding_window if self.is_sliding else None # Initialize the rotary embedding. - if self.is_sliding: - # Local attention. Override the values in config.json. - self.rope_theta = config.rope_local_base_freq - self.rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. # Global attention. Use the values in config.json. - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. + if self.is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq + self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) if getattr(config, "is_causal", True): diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 64443190f53e..8f1447ba34a8 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -332,18 +332,21 @@ def __init__( ) layer_idx = extract_layer_index(prefix) - is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" self.sliding_window = config.sliding_window if is_sliding else None # Initialize the rotary embedding. - if is_sliding: - # Local attention. Override the values in config.json. - rope_theta = config.rope_local_base_freq - rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. 
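Note: the Gemma3/Gemma3n hunks distinguish a nested, per-layer-type rope config (assumed Transformers v5 layout) from the flat v4 layout, where sliding-window layers then override `rope_theta` with `rope_local_base_freq`. A standalone sketch of the two assumed shapes and the selection logic, with hypothetical values:

    # Hypothetical v5-style config: one entry per layer type.
    rope_parameters_v5 = {
        "full_attention": {"rope_type": "default", "rope_theta": 1_000_000.0},
        "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
    }
    # Hypothetical v4-style config: a single flat dict.
    rope_parameters_v4 = {"rope_type": "default", "rope_theta": 1_000_000.0}

    def select_rope_parameters(rope_parameters: dict, layer_type: str,
                               rope_local_base_freq: float) -> dict:
        # Sketch of the per-layer selection used in the hunks above.
        if layer_type in rope_parameters:
            return rope_parameters[layer_type]
        params = rope_parameters.copy()
        if layer_type == "sliding_attention":
            params["rope_theta"] = rope_local_base_freq
        return params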
# Global attention. Use the values in config.json. - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. + if is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq first_kv_shared_layer_idx = ( config.num_hidden_layers - config.num_kv_shared_layers @@ -383,9 +386,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index faa0674a2e43..f8ef3b0385fb 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -57,10 +57,8 @@ def __init__( max_position: int = 4096 * 32, head_dim: int | None = None, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -86,7 +84,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -107,8 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) @@ -150,8 +146,6 @@ def __init__( quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Glm4Attention( config=config, @@ -159,12 +153,10 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7a4fee76ae6b..6581bbda6d60 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -703,7 +703,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 1422dbe9b3cd..5aa51af54a00 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -26,7 +26,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -233,8 +232,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -264,7 +261,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * 
self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = use_qk_norm @@ -291,8 +287,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( @@ -341,8 +336,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. @@ -354,8 +347,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e416ecde0c1e..e94de8952fa6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,13 +95,12 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index af0c9209231c..815c2fba4d9f 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,13 +92,12 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7df3b087ccb8..f310f71af92d 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,16 +67,16 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, dtype=torch.float32, - rope_scaling={ + rope_parameters={ + "rope_theta": config.rope_parameters["rope_theta"], "rope_type": "yarn", - "factor": config.rope_scaling["factor"], - "original_max_position_embeddings": config.rope_scaling[ + "factor": config.rope_parameters["factor"], + "original_max_position_embeddings": config.rope_parameters[ "original_max_position_embeddings" ], - "beta_fast": config.rope_scaling["beta_fast"], - "beta_slow": config.rope_scaling["beta_slow"], + "beta_fast": config.rope_parameters["beta_fast"], + "beta_slow": config.rope_parameters["beta_slow"], }, is_neox_style=True, ) @@ -90,7 +90,6 @@ def __init__( self.q_size = 
self.num_attention_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.qkv_proj = QKVParallelLinear( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c44b4021471e..1dc205b47753 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -112,8 +111,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -143,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -167,8 +163,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,14 +199,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -225,8 +212,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5c6759ded066..8f4139d63c3f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -141,8 +141,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -172,7 +171,6 @@ def __init__( if attention_multiplier is not None else self.head_dim**-1 ) - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -194,9 +192,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -235,16 +232,12 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index a340112ec62a..9d5eeef198a6 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -273,10 +273,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=int(config.rope_theta), - rope_scaling=config.rope_scaling - if hasattr(config, "rope_scaling") and config.rope_scaling is not None - else None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) else: diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 926c539af33b..fd346db7e35a 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -84,16 +84,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 9dc231863f74..4bf23cd6fd19 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -25,6 +25,7 @@ from collections.abc import Iterable from itertools import islice +from typing import Any import torch import torch.nn.functional as F @@ -134,7 +135,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -161,7 +162,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -183,7 +183,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) @@ -234,15 +234,12 @@ def __init__( if not self.use_fp8 and hasattr(quant_config, "is_fp8"): self.use_fp8 = quant_config.is_fp8 - # Requires transformers > 4.32.0 - # Default rope_theta value if not in config - rope_theta = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git 
a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 1eadcbe67ade..9fa5e2bd33f2 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import regex as re import torch @@ -142,8 +141,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -177,7 +174,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -204,8 +200,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -254,8 +249,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -289,7 +282,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -314,8 +306,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -494,14 +485,6 @@ def __init__( if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False @@ -520,8 +503,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, @@ -537,8 +518,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 60fbeb842dd4..dc8f821bd134 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -91,8 +91,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - 
rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -120,7 +119,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.key_value_groups = int(self.num_heads / self.num_kv_heads) self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.wqkv = QKVParallelLinear( @@ -144,8 +142,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,15 +201,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6dc081e34157..a57db82242af 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -30,15 +30,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index f3675075a48f..4562b2202c5e 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -190,9 +189,7 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, use_nope: bool = False, - rope_scaling: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -210,11 +207,9 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None - assert rope_scaling is None assert num_heads % tp_size == 0 self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index aeb25602f11a..74bdde27ece5 100644 --- a/vllm/model_executor/models/lfm2.py +++ 
b/vllm/model_executor/models/lfm2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -96,8 +95,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -126,7 +123,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -149,8 +145,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -199,14 +194,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2Attention( @@ -215,8 +202,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 6b7b5564ee98..c088a0821152 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -189,8 +188,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -219,7 +216,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -242,8 +238,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -293,14 +288,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = 
getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2MoeAttention( @@ -309,8 +296,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 0a3f37c30ab5..d5b49d2fb4c2 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -120,8 +119,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -157,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings llama_4_scaling_config = getattr(config, "llama_4_scaling", None) @@ -186,9 +182,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -258,7 +252,6 @@ def forward( def _init_rotary_emb( self, config: LlamaConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -270,8 +263,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -291,14 +283,6 @@ def __init__( quant_config = self.get_quant_config(vllm_config) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -326,8 +310,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a7e0732ec71e..4c6d1d424475 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -19,7 +19,6 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -208,7 +205,6 @@ def __init__( self.floor_scale = getattr(config, "floor_scale", 8192.0) self.attn_scale = getattr(config, "attn_scale", 0.1) - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.n_rep = self.num_heads // self.num_kv_heads self.qk_norm = ( @@ -248,8 +244,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=int(rope_theta), - rope_scaling=rope_scaling if rope_scaling != "default" else None, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) if not self.nope @@ -331,8 +326,6 @@ def __init__( self.layer_idx = extract_layer_index(prefix) self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling max_position_embeddings = config.max_position_embeddings self.self_attn = Llama4Attention( @@ -340,8 +333,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 5de10e708683..fafe97cd2be7 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -108,8 +108,7 @@ def __init__( eos_token_id=100001, pretraining_tp=1, tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, mla_scale_q_lora=False, @@ -162,8 +161,13 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mla_scale_q_lora = mla_scale_q_lora @@ -336,15 +340,7 @@ def __init__( super().__init__() self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) # Dual attention structure self.self_attn = nn.ModuleList( @@ -361,8 +357,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=None diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 914b097fe199..04923833065f 100644 --- a/vllm/model_executor/models/minicpm.py +++ 
b/vllm/model_executor/models/minicpm.py @@ -230,8 +230,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -257,7 +256,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -281,8 +279,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -324,8 +321,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -339,8 +334,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index d3b6966ee3a7..2d775219fc97 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -25,8 +25,6 @@ # limitations under the License. 
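The MiniCPM hunks above show the model-side pattern this patch applies throughout: attention modules stop threading rope_theta/rope_scaling through their constructors and instead forward the consolidated config.rope_parameters dict to get_rope. A minimal sketch of the new call shape (the build_rotary_emb helper is illustrative, not part of the patch):

from vllm.model_executor.layers.rotary_embedding import get_rope

def build_rotary_emb(config, head_dim: int, max_position_embeddings: int):
    # config.rope_parameters is expected to carry at least
    # {"rope_type": ..., "rope_theta": ...} plus any scaling fields.
    return get_rope(
        head_dim,
        rotary_dim=head_dim,
        max_position=max_position_embeddings,
        rope_parameters=config.rope_parameters,
    )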
"""Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any - import torch from torch import nn from transformers import PretrainedConfig @@ -62,8 +60,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -84,7 +80,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.q_a_proj = ReplicatedLinear( @@ -127,8 +122,7 @@ def __init__( self.qk_rope_head_dim, rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_local_heads, @@ -204,8 +198,6 @@ def _init_attn_block(self): v_head_dim=self.config.v_head_dim, q_lora_rank=self.config.q_lora_rank, kv_lora_rank=self.config.kv_lora_rank, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index d0cdb70aa857..e6bccfcac4f1 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -69,8 +69,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -84,8 +82,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 49d2f2d26196..4955c68c0cda 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -149,8 +149,7 @@ def __init__( num_heads: int, num_kv_heads: int, rotary_dim: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, @@ -180,7 +179,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -205,8 +203,7 @@ def __init__( self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -252,8 +249,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = max( @@ -269,8 +264,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rotary_dim=config.rotary_dim, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index bf1ecc822756..50f7396e2de6 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -188,7 +188,7 @@ def __init__( num_kv_heads: int, rotary_dim: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict | None = None, sliding_window: int | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, @@ -214,7 +214,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.sliding_window = sliding_window self.prefix = prefix @@ -247,7 +246,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=int(rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, dtype=torch.float32, ) @@ -287,8 +286,6 @@ def __init__( self.hidden_size = config.hidden_size self.expert_num = expert_num - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads @@ -328,7 +325,7 @@ def __init__( else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, quant_config=quant_config, layer_idx=self._ilayer, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d7a1cb82fb4f..54ab8dd493e7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -161,7 +161,6 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,7 +188,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -211,7 +209,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -248,15 +246,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = MixtralAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, 
prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e25a104d822a..286859d188d3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -292,13 +292,17 @@ def __init__( prefix=f"{prefix}.o_proj", ) + rope_parameters = { + "rope_type": "mllama4", + "rope_theta": config.rope_parameters["rope_theta"], + } + self.rotary_emb = get_rope( head_size=self.head_dim, rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, - base=config.rope_theta, - rope_scaling={"rope_type": "mllama4"}, + rope_parameters=rope_parameters, is_neox_style=False, dtype=torch.complex64, # important ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ab83a271e30a..dc06938d5d6e 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -410,7 +410,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -437,7 +436,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 92dcf5ea5700..c3337bd1ea69 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -150,8 +149,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -181,7 +178,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings @@ -206,8 +202,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( @@ -243,14 +238,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -264,8 +251,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, 
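Where a model needs a RoPE variant that is not stored verbatim in the checkpoint config, the dict is assembled by hand, as in the mllama4 vision hunk above: only rope_theta is reused from config.rope_parameters, while rope_type is fixed by the implementation. A small sketch of that construction (the vision_rope_parameters helper name is illustrative):

def vision_rope_parameters(config) -> dict:
    # The base period comes from the checkpoint config; the rope_type is
    # chosen by the model code rather than read from the config.
    return {
        "rope_type": "mllama4",
        "rope_theta": config.rope_parameters["rope_theta"],
    }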
max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b839206a3094..2eebe38051cb 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -82,8 +81,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -97,8 +94,6 @@ def __init__( hidden_size, num_heads, num_kv_heads, - rope_theta, - rope_scaling, max_position_embeddings, quant_config, bias, @@ -111,7 +106,6 @@ def __init__( def _init_rotary_emb( self, config, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: # Enables YARN for Mistral and LLaMA4 derivatives. @@ -126,8 +120,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -148,14 +141,6 @@ def __init__( self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -176,8 +161,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 487e3f671a45..bd8a8e317544 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -87,7 +87,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.clip_qkv = config.clip_qkv # Attention input projection. 
Projects x -> (q, k, v) @@ -105,7 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 045582c889ee..f0f6b2f6b3e6 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -99,7 +99,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = self.config.max_position_embeddings - self.rope_theta = self.config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -139,15 +138,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.attn", ) - # Rotary embeddings. Rope scaling is only applied on full attention - # layers. - self.rope_scaling = self.config.rope_scaling if sliding_window is None else None + # Rotary embeddings. Rope scaling is only applied on full attention layers. + if sliding_window is None: + rope_parameters = self.config.rope_parameters + else: + rope_theta = self.config.rope_parameters["rope_theta"] + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) # Attention output projection. diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 499eb05de76e..c39e338d72e2 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -123,8 +123,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) num_heads = config.num_attention_heads @@ -148,7 +146,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,8 +173,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index d13a745beffe..f814cdfec5a2 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -77,6 +77,7 @@ sequence_parallel_chunk, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta def check_ffn_act_fn(act_fn: str): @@ -259,7 +260,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = 
None, quant_config: QuantizationConfig | None = None, @@ -274,8 +274,6 @@ def __init__( self.v_head_dim = v_head_dim self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank - self.rope_theta = rope_theta - self.tp_size = get_tensor_model_parallel_world_size() if num_heads % self.tp_size != 0: raise ValueError( @@ -339,7 +337,9 @@ def __init__( ) # TODO: remove hard coding - rope_scaling = { + set_default_rope_theta(config, default_theta=10000) + rope_parameters = { + "rope_theta": config.rope_parameters["rope_theta"], "beta_fast": 32, "beta_slow": 1, "factor": 1, @@ -353,8 +353,7 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) @@ -407,8 +406,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -454,7 +451,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -475,9 +471,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -521,7 +515,6 @@ def forward( def _init_rotary_emb( self, config: PretrainedConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -533,8 +526,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) @@ -555,7 +547,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) @@ -579,7 +570,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, @@ -607,8 +597,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=getattr(config, "rope_scaling", None), max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 859cd2cecf89..b30be93ca726 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -88,8 +88,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -115,7 +114,6 @@ def __init__( 
self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -139,8 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -175,15 +172,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 9db6c317c26a..63d2fff6ec8b 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -112,10 +112,8 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -140,7 +138,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config # Get total_ut_steps from config, default to 4 if not specified @@ -170,8 +167,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = nn.ModuleList() @@ -226,9 +222,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -244,10 +237,8 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3bf6a1d9763d..98963d52e484 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.partial_rotary_factor = config.partial_rotary_factor 
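The openpangu hunks earlier in this section rely on set_default_rope_theta, a helper added later in this patch (vllm/transformers_utils/config.py), to back-fill a base period for configs that never defined one before model-specific scaling keys are merged in. A sketch of the expected call order (prepare_rope_parameters is an illustrative name; the numeric factors are the hard-coded values visible in the openpangu hunk):

from vllm.transformers_utils.config import set_default_rope_theta

def prepare_rope_parameters(config) -> dict:
    # Ensure config.rope_parameters exists and carries a rope_theta.
    set_default_rope_theta(config, default_theta=10000)
    rope_parameters = {"rope_theta": config.rope_parameters["rope_theta"]}
    # Layer the model-specific YaRN-style keys on top of the defaulted value.
    rope_parameters.update({"beta_fast": 32, "beta_slow": 1, "factor": 1})
    return rope_parameters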
self.is_causal = True @@ -138,7 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 8fee53c23fb4..da476f621627 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -115,16 +115,12 @@ def __init__( ) assert rotary_dim % 2 == 0 - # pylint: disable=C0301 - # Refer to: - # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = getattr(config, "rope_theta", 10000.0) max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 92fd858b608b..8ffac95d9396 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -86,7 +86,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters=None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=2, @@ -119,7 +119,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + if rope_parameters is None: + rope_theta = kwargs.pop("rope_theta", 1e6) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -302,12 +304,11 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, head_dim: int | None = None, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: dict | None = None, prefix: str = "", ) -> None: super().__init__() @@ -332,8 +333,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.qkv_proj = QKVParallelLinear( hidden_size, @@ -355,9 +354,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) self.attn = Attention( self.num_heads, @@ -393,7 +391,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = PhiMoEAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -402,10 +399,9 @@ def __init__( head_dim=getattr( config, "head_dim", self.hidden_size // config.num_attention_heads ), - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = PhiMoE( diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 52c9755e0e0e..22f9c87fc905 100644 --- 
a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -567,10 +567,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) - self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -581,8 +577,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, ) self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm.weight = torch.nn.Parameter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 50a125c3f597..c973e7917098 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -83,8 +83,7 @@ def __init__( hidden_size: int, num_heads: int, max_position_embeddings: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -117,8 +116,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -153,14 +151,11 @@ def __init__( super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1bbb969ce5aa..32b6d6dd07b8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -57,7 +57,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( @@ -114,11 +114,10 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -143,7 +142,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -167,8 +165,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, 
max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) attn_cls = ( @@ -216,9 +213,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -237,10 +232,9 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5b5d50ec8935..8e3c0e84dfe5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,7 +641,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2ff0d19df238..6b97d0b2ca2e 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -194,8 +194,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -222,7 +221,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -248,8 +246,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -291,8 +288,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -301,8 +296,7 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cda8eaf5377f..d25ff2785bfe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -643,7 +643,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git 
a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 8d7f22a33fe6..93a629d81e8f 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP @@ -57,14 +58,13 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, max_position: int = 4096 * 32, head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -89,7 +89,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -113,8 +112,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -166,9 +164,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -187,13 +183,12 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 96751fee800b..8ee3dd99e11d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -216,8 +216,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -247,7 +246,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -273,8 +271,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -326,8 +323,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -336,8 +331,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ad631f61e4b9..bfed64728305 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -748,8 +748,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=config.partial_rotary_factor, dual_chunk_attention_config=self.dual_chunk_attention_config, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index d2fd74a5e41a..54ef56f83344 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -338,7 +338,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 0c546309400b..c10aeaec5ab8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -345,7 +345,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index bf211d28f184..4744d8e44f39 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -54,6 +54,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -112,11 +113,10 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -140,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -163,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -200,9 +198,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) # By default, SeedOss uses causal attention as it is a # decoder-only model. @@ -219,10 +215,9 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 4ec855f79444..7e9fc51036d2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -25,7 +25,6 @@ """Inference-only Solar model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -142,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -166,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -202,15 +197,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -224,8 +210,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 06eb7201c1a8..a738fcbb4ee2 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -153,7 +153,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + rope_parameters=self.config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0f2942acd500..1118fca3cac9 100644 --- 
a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -91,7 +91,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias @@ -115,7 +114,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 4fff356b29e2..3c377a2c539d 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -36,6 +36,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.step3_vl import Step3TextConfig from .interfaces import SupportsPP from .utils import ( @@ -144,9 +145,8 @@ def __init__( num_heads: int, num_kv_heads: int, norm_eps: float, - rope_theta: int, + rope_parameters: dict[str, Any], share_q_dim: int | None = None, - rope_scaling: dict[str, Any] | None = None, max_position_embedding: int = 8192, head_dim: int = 256, cache_config: CacheConfig | None = None, @@ -198,8 +198,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embedding, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) scaling = self.head_dim**-0.5 self.attn = Attention( @@ -227,15 +226,13 @@ def forward( class Step3TextDecoderLayer(nn.Module): def __init__( self, - config: ModelConfig, + config: Step3TextConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: super().__init__() - config = config.hf_config self.hidden_size = config.hidden_size - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Step3TextAttention( hidden_size=self.hidden_size, @@ -247,8 +244,7 @@ def __init__( max_position_embedding=config.max_position_embedding, head_dim=config.head_dim, share_q_dim=config.share_q_dim, - rope_theta=config.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -338,7 +334,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( - config=vllm_config.model_config, + config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 517eb54d53ac..b807f45b5d52 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,6 +22,7 @@ import torch from torch import nn +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: """ text_config = vllm_config.model_config.hf_config.get_text_config() # Dynamic rope scaling is not compatible with torch.compile - rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} - return 
rope_scaling.get("rope_type") != "dynamic" + rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} + if rope_parameters: + # Nest rope_parameters if not nested already to simplify logic + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters = {"": rope_parameters} + return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) + return True diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 729a9655d087..653b5b9beef7 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -128,7 +128,6 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_hybrid_layers = num_hybrid_layers - self.rope_theta = config.rope_theta self.attention_hidden_size = config.attention_hidden_size self.total_num_attention_heads = config.num_attention_heads @@ -233,8 +232,7 @@ def __init__( head_size=self.attention_head_dim, rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, - base=self.rope_theta, - rope_scaling=None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ac4a71648cec..4ca155af03dc 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,8 +7,9 @@ from collections.abc import Callable from dataclasses import asdict from functools import cache, partial +from importlib.metadata import version from pathlib import Path -from typing import Any, Literal, TypeVar +from typing import Any, Literal, TypeAlias, TypeVar import huggingface_hub from huggingface_hub import ( @@ -24,7 +25,9 @@ RepositoryNotFoundError, RevisionNotFoundError, ) +from packaging.version import Version from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -390,21 +393,61 @@ def file_or_path_exists( ) -def patch_rope_scaling(config: PretrainedConfig) -> None: +def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None: + """Some models may have no rope_theta in their config but still use RoPE. 
+ This function sets a default rope_theta if it's missing.""" + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = default_theta + + +def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - patch_rope_scaling(text_config) + # Retrieve rope_parameters differently based on Transformers version + if Version(version("transformers")) >= Version("5.0.0.dev0"): + from transformers.modeling_rope_utils import RopeParameters - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None: - patch_rope_scaling_dict(rope_scaling) + rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( + config, "rope_parameters", None + ) + elif hasattr(config, "rope_parameters"): + # We are in Transformers v4 and rope_parameters + # has already been patched for this config + return + else: + # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters + rope_theta: float | None = getattr(config, "rope_theta", None) + rope_scaling: dict | None = getattr(config, "rope_scaling", None) + rope_parameters = rope_scaling + # Move rope_theta into rope_parameters + if rope_theta is not None: + rope_parameters = rope_parameters or {"rope_type": "default"} + rope_parameters["rope_theta"] = rope_theta + # Add original_max_position_embeddings if present + if rope_parameters and ( + ompe := getattr(config, "original_max_position_embeddings", None) + ): + rope_parameters["original_max_position_embeddings"] = ompe + # Write back to config + config.rope_parameters = rope_parameters + + # No RoPE parameters to patch + if rope_parameters is None: + return + + # Handle nested rope_parameters in interleaved sliding attention models + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in rope_parameters.values(): + patch_rope_parameters_dict(rope_parameters_layer_type) + else: + patch_rope_parameters_dict(rope_parameters) -def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: - if "rope_type" in rope_scaling and "type" in rope_scaling: - rope_type = rope_scaling["rope_type"] - rope_type_legacy = rope_scaling["type"] +def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: + if "rope_type" in rope_parameters and "type" in rope_parameters: + rope_type = rope_parameters["rope_type"] + rope_type_legacy = rope_parameters["type"] if rope_type != rope_type_legacy: raise ValueError( f"Found conflicts between 'rope_type={rope_type}' (modern " @@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: "You should only specify one of them." 
) - if "rope_type" not in rope_scaling and "type" in rope_scaling: - rope_scaling["rope_type"] = rope_scaling["type"] + if "rope_type" not in rope_parameters and "type" in rope_parameters: + rope_parameters["rope_type"] = rope_parameters["type"] logger.info("Replacing legacy 'type' key with 'rope_type'") - if "rope_type" not in rope_scaling: - raise ValueError("rope_scaling should have a 'rope_type' key") + if "rope_type" not in rope_parameters: + raise ValueError("rope_parameters should have a 'rope_type' key") - if rope_scaling["rope_type"] == "su": - rope_scaling["rope_type"] = "longrope" + if rope_parameters["rope_type"] == "su": + rope_parameters["rope_type"] = "longrope" logger.warning("Replacing legacy rope_type 'su' with 'longrope'") - elif rope_scaling["rope_type"] == "mrope": - assert "mrope_section" in rope_scaling - rope_scaling["rope_type"] = "default" + elif rope_parameters["rope_type"] == "mrope": + assert "mrope_section" in rope_parameters + rope_parameters["rope_type"] = "default" logger.warning("Replacing legacy rope_type 'mrope' with 'default'") def _uses_mrope(config: PretrainedConfig) -> bool: - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is None: + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is None: return False - return "mrope_section" in rope_scaling + return "mrope_section" in rope_parameters def uses_mrope(config: PretrainedConfig) -> bool: @@ -690,7 +733,14 @@ def get_config( logger.debug("Overriding HF config with %s", hf_overrides_fn) config = hf_overrides_fn(config) - patch_rope_scaling(config) + # Exhaustively patch RoPE parameters everywhere they might be + patch_rope_parameters(config) + patch_rope_parameters(config.get_text_config()) + SubConfigs: TypeAlias = dict[str, PretrainedConfig] + sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) + if sub_configs: + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) if trust_remote_code: maybe_register_config_serialize_by_value() diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py index 9b634fd037a3..47fee9882f9f 100644 --- a/vllm/transformers_utils/configs/afmoe.py +++ b/vllm/transformers_utils/configs/afmoe.py @@ -24,7 +24,7 @@ def __init__( rms_norm_eps: float = 1e-5, use_cache: bool = True, tie_word_embeddings: bool = False, - rope_theta: float = 10000.0, + rope_parameters: dict | None = None, rope_scaling: dict | None = None, num_experts: int = 64, num_experts_per_tok: int = 6, @@ -56,7 +56,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 10000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.rope_scaling = rope_scaling self.moe_intermediate_size = moe_intermediate_size diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 1707e15285c8..ba4b1a8f701f 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. 
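The patch_rope_parameters/patch_rope_parameters_dict pair above is what keeps older checkpoints working: a config that only carries rope_theta and rope_scaling is rewritten in place into a single rope_parameters dict, with the legacy 'type', 'su' and 'mrope' spellings normalized. A rough before/after, assuming a Transformers v4 environment (under v5 the config would already expose rope_parameters) and using SimpleNamespace as a stand-in for a PretrainedConfig:

from types import SimpleNamespace
from vllm.transformers_utils.config import patch_rope_parameters

cfg = SimpleNamespace(
    rope_theta=500000.0,
    rope_scaling={"type": "yarn", "factor": 4.0},
    original_max_position_embeddings=8192,
)
patch_rope_parameters(cfg)
# cfg.rope_parameters now looks roughly like:
# {"rope_type": "yarn", "factor": 4.0, "rope_theta": 500000.0,
#  "original_max_position_embeddings": 8192}
# (the legacy "type" key is left in place alongside "rope_type")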
- rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -132,7 +139,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters: dict[str, Any] | None = None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=1, @@ -165,7 +172,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1e6) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 1f2f4d446288..c343dc0999a8 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -25,8 +26,7 @@ def __init__( bos_token_id=None, eos_token_id=100257, tie_word_embeddings=False, - rope_theta=500000.0, - rope_scaling=None, + rope_parameters: dict[str, Any] | None = None, attention_bias=False, attention_dropout=0.0, num_experts_per_tok=5, @@ -62,8 +62,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -73,5 +78,5 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + if self.rope_parameters is not None and "type" in self.rope_parameters: + self.rope_parameters["rope_type"] = self.rope_parameters["type"] diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index 65ddf48c5249..14894816801d 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -29,8 +29,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, tie_word_embeddings=False, moe_intermediate_size: int | None = None, moe_renormalize: bool = True, @@ -73,8 +72,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py index 37c038e12db8..b399a03c030f 100644 --- a/vllm/transformers_utils/configs/lfm2_moe.py +++ b/vllm/transformers_utils/configs/lfm2_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. max_position_embeddings (`int`, *optional*, defaults to 128000): The maximum sequence length that this model might ever be used with. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -100,7 +101,7 @@ def __init__( bos_token_id: int = 1, eos_token_id: int = 2, tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 128_000, use_cache: bool = True, norm_eps: float = 0.00001, @@ -121,7 +122,10 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py index e49bd26b2b00..f1bbd057103e 100644 --- a/vllm/transformers_utils/configs/midashenglm.py +++ b/vllm/transformers_utils/configs/midashenglm.py @@ -98,6 +98,6 @@ def __init__( if text_config else Qwen2_5OmniTextConfig() ) - self.text_config.rope_scaling = None # uses_mrope is false + self.text_config.rope_parameters = None # uses_mrope is false self.audio_token_id = audio_token_id super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index c6f04febe37e..8f72f0b28b0d 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict: "apply_scale": "apply_yarn_scaling", } yarn_config = config.get("yarn") or {} - config["rope_scaling"] = { + config["rope_parameters"] = { "rope_type": "yarn", "mscale_all_dim": 1, } for old_name, new_name in yarn_config_map.items(): if old_name in yarn_config: - config["rope_scaling"][new_name] = yarn_config.pop(old_name) + config["rope_parameters"][new_name] = yarn_config.pop(old_name) assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 60eed549561f..d112c71d7d20 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -132,8 +132,7 @@ def __init__( bos_token_id=2, eos_token_id=3, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, @@ -160,8 +159,13 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") @@ -169,7 +173,7 @@ def __init__( or partial_rotary_factor ) self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_validation() + self._rope_parameters_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias @@ -182,31 +186,29 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with two fields, " - f"`type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}" - ) - if ( - rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0 - ): + rope_type: str | None = self.rope_parameters.get("rope_type", None) + factor: float | None = self.rope_parameters.get("factor", None) + + if rope_type not in {"default", "linear", "dynamic"}: raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, got " - f"{rope_scaling_factor}" + "`rope_type` must be one of ['default', 'linear', 'dynamic'], " + f"got {rope_type}" ) + if rope_type != "default": + if factor is None: + raise ValueError( + "If `rope_type` is not 'default', `rope_parameters` " + "must include a `factor` field. Got `None`." 
+ ) + if not isinstance(factor, float) or factor <= 1.0: + raise ValueError( + "`rope_parameters`'s factor field must be a float > 1, got " + f"{factor}" + ) diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index f5a9a7cd36bd..c4691b661af3 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -24,8 +24,7 @@ def __init__( bos_token_id=None, eos_token_id=50279, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, rms_norm_eps=1e-5, @@ -63,8 +62,13 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index 21750bde2f87..d2fe58d48da6 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_parameters (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. `rope_type` (`str`): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. 
@@ -199,8 +198,7 @@ def __init__( rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.25, attention_bias=False, attention_dropout=0.0, @@ -236,8 +234,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index 637b82d88e26..0ee650a70451 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -52,8 +52,7 @@ def __init__( moe_intermediate_size: int = 5120, moe_num_experts: int = 48, moe_top_k: int = 3, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 65536, share_expert_dim: int = 5120, share_q_dim: int = 2048, @@ -130,8 +129,13 @@ def __init__( self.moe_intermediate_size = moe_intermediate_size self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.max_position_embedding = max_position_embedding self.share_expert_dim = share_expert_dim self.share_q_dim = share_q_dim From 0c80efd94fb8c17cfc7d1bcb9cdb65f154340994 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Thu, 20 Nov 2025 01:32:55 +0800 Subject: [PATCH 195/578] GLM-V video segmentation solution adjustment (#28941) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_1v.py | 94 +++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 6581bbda6d60..d141e9549806 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -37,7 +37,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import BatchFeature +from transformers import BatchFeature, Glm4vProcessor from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, @@ -1028,7 +1028,7 @@ def get_num_frames_with_most_features( return max(max_frames_per_video, 1) - def _get_video_second_idx( + def _get_video_second_idx_glm4v( self, metadata: dict[str, Any], total_frames: int ) -> list[int]: video_processor = self.get_video_processor() @@ -1079,6 +1079,83 @@ def _get_video_second_idx( 
selected_timestamps.append(timestamps_list[idx]) return selected_timestamps + def _get_video_second_idx_glm46v( + self, metadata: dict[str, Any], total_frames: int + ) -> list[int]: + video_processor = self.get_video_processor() + + video_fps = metadata["fps"] + meta_frames = metadata.get("total_num_frames", total_frames) + max_frame_idx = meta_frames - 1 + duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1) + + do_sample_frames = metadata.get("do_sample_frames", True) + if not do_sample_frames: + frame_indices = metadata["frames_indices"] + else: + DYNAMIC_FPS_THRES = {30: 3, 300: 1, 2400: 0.5} + MAX_FRAME_COUNT_DYNAMIC = 640 + MAX_DURATION = 2400 + + effective_duration = min(duration, MAX_DURATION) + if effective_duration <= 30: + target_fps = DYNAMIC_FPS_THRES[30] + elif effective_duration <= 300: + target_fps = DYNAMIC_FPS_THRES[300] + else: + target_fps = DYNAMIC_FPS_THRES[2400] + + temporal_patch_size = getattr(video_processor, "temporal_patch_size", 1) + extract_t = int(effective_duration * target_fps * temporal_patch_size) + extract_t = min(extract_t, MAX_FRAME_COUNT_DYNAMIC) + + duration_per_frame = 1 / video_fps + timestamps = [i * duration_per_frame for i in range(meta_frames)] + max_second = int(duration) + + if meta_frames < extract_t: + frame_indices = np.linspace( + 0, meta_frames - 1, extract_t, dtype=int + ).tolist() + else: + frame_indices = [] + current_second = 0.0 + inv_fps = 1 / (temporal_patch_size * target_fps) + for frame_index in range(meta_frames): + if timestamps[frame_index] >= current_second: + current_second += inv_fps + frame_indices.append(frame_index) + if current_second >= max_second: + break + + if len(frame_indices) < extract_t: + if len(frame_indices) == 0: + start, end = 0, max(meta_frames - 1, 0) + else: + start, end = frame_indices[0], frame_indices[-1] + frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist() + elif len(frame_indices) > extract_t: + frame_indices = np.linspace( + 0, meta_frames - 1, extract_t, dtype=int + ).tolist() + + seen, uniq = set(), [] + for idx in frame_indices: + if idx not in seen: + seen.add(idx) + uniq.append(idx) + + if len(uniq) & 1: + uniq.append(uniq[-1]) + + frame_indices = uniq + full_second_idxs = [int(idx / video_fps) for idx in frame_indices] + timestamps_list = full_second_idxs[::2] + selected_timestamps = [] + for idx in range(len(timestamps_list)): + selected_timestamps.append(timestamps_list[idx]) + return selected_timestamps + def _construct_video_placeholder( self, video_array: np.ndarray, @@ -1097,9 +1174,18 @@ def _construct_video_placeholder( merge_length = image_processor.merge_size**2 assert isinstance(grid_thw, torch.Tensor) - timestamps = self._get_video_second_idx(metadata, len(video_array)) + timestamps = ( + self._get_video_second_idx_glm4v(metadata, len(video_array)) + if isinstance(hf_processor, Glm4vProcessor) + else self._get_video_second_idx_glm46v(metadata, len(video_array)) + ) + + timestamp_format = ( + "{}" if isinstance(hf_processor, Glm4vProcessor) else "{:.1f} seconds" + ) frames_idx_token = [ - tokenizer.encode(str(i), add_special_tokens=False) for i in timestamps + tokenizer.encode(timestamp_format.format(i), add_special_tokens=False) + for i in timestamps ] T, H, W = grid_thw num_tokens_per_frame = int(H * W) // merge_length From 61728cd1dfb03cbbfa03924f2a2cda311cfc13ac Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:32:19 -0500 Subject: [PATCH 196/578] Re-enable FlashInfer for 
Llama4 on Blackwell in e2e fusion tests (#28966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič --- .buildkite/test-pipeline.yaml | 2 ++ tests/compile/distributed/test_fusions_e2e.py | 12 ++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d4b6f4077ab3..98daebcc0693 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -930,6 +930,8 @@ steps: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ # can affect pattern matching - vllm/model_executor/layers/layernorm.py diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 2e1b595a4389..661172e1965b 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -47,12 +47,8 @@ class ModelBackendTestCase(NamedTuple): ModelBackendTestCase( # Use smaller model for L40s in CI model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell - # so FI attention+fp8_quant is at least tested once model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER - if is_blackwell() - else AttentionBackendEnum.TRITON_ATTN, + backend=AttentionBackendEnum.TRITON_ATTN, matches=Matches( attention_fusion=32, allreduce_fusion=65, @@ -65,9 +61,9 @@ class ModelBackendTestCase(NamedTuple): model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), # TODO FlashInfer attn broken on Hopper with kvcache=fp8: # https://github.com/vllm-project/vllm/issues/28568 - # TODO FlashInfer attn broken on Blackwell for llama4: - # https://github.com/vllm-project/vllm/issues/28604 - backend=AttentionBackendEnum.TRITON_ATTN, + backend=AttentionBackendEnum.FLASHINFER + if is_blackwell() + else AttentionBackendEnum.TRITON_ATTN, matches=Matches( attention_fusion=48, allreduce_fusion=96, From 3319a493fcc3e4733382f0dc812184234e9c3dcb Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 19 Nov 2025 11:20:22 -0800 Subject: [PATCH 197/578] [Core] Reuse created spec tokens lists to mitigate GC cost (#28917) Signed-off-by: Jialin Ouyang --- vllm/v1/worker/gpu_input_batch.py | 18 ++++++++++++------ vllm/v1/worker/gpu_model_runner.py | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 023b5edb2c34..c1bfe727d86e 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -251,7 +251,7 @@ def __init__( self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids # Store last speculative tokens for sampler. - self.spec_token_ids: list[list[int] | None] = [] + self.spec_token_ids: list[list[int]] = [[] for _ in range(max_num_reqs)] # This is updated each time the batch constituents change. 
self.sampling_metadata = self._make_sampling_metadata() @@ -313,7 +313,7 @@ def add_request( else: self._req_ids[req_index] = req_id self.req_output_token_ids[req_index] = request.output_token_ids - self.spec_token_ids[req_index] = [] + self.spec_token_ids[req_index].clear() self.req_id_to_index[req_id] = req_index @@ -462,7 +462,7 @@ def remove_request(self, req_id: str) -> int | None: self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None - self.spec_token_ids[req_index] = None + self.spec_token_ids[req_index].clear() # LoRA lora_id = self.request_lora_mapping[req_index] @@ -654,9 +654,15 @@ def condense(self) -> None: self.req_output_token_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - spec_token_ids = self.spec_token_ids[last_req_index] - self.spec_token_ids[empty_index] = spec_token_ids - self.spec_token_ids[last_req_index] = None + if last_req_index != empty_index: + ( + self.spec_token_ids[last_req_index], + self.spec_token_ids[empty_index], + ) = ( + self.spec_token_ids[empty_index], + self.spec_token_ids[last_req_index], + ) + self.spec_token_ids[last_req_index].clear() num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3b00085b6bb9..0c35f1330e9f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -892,7 +892,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # conform to the schema. This can result in # scheduler_output.scheduled_spec_decode_tokens being empty, # even when speculative decoding is enabled. - self.input_batch.spec_token_ids[req_index] = spec_token_ids + self.input_batch.spec_token_ids[req_index].clear() + self.input_batch.spec_token_ids[req_index].extend(spec_token_ids) # there are no draft tokens with async scheduling, # we clear the spec_decoding info in scheduler_output and From fe69f331f84d99541564dfe4852dd45220ed7875 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:23:54 -0500 Subject: [PATCH 198/578] [Kernels] Improve H200 Fused MoE Config (#28992) Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 6fcf408755f5..532c16e89926 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,11 +1,11 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "2": { "BLOCK_SIZE_M": 16, @@ -13,82 +13,82 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "48": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "96": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, "128": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "256": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -96,10 +96,10 @@ "num_stages": 3 }, "512": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -109,7 +109,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, @@ -117,21 +117,21 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 } From 9d2d5612573c20f8bf00242a8525c2a5dcfe4c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=B0=E5=85=AE?= <38908462+zhyajie@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:30:57 +0800 Subject: [PATCH 199/578] [Bugfix] Fix precision corruption when shared_experts_stream=None (#28942) Signed-off-by: zhyajie Co-authored-by: zhyajie --- vllm/model_executor/layers/fused_moe/layer.py | 11 +++++++---- vllm/utils/torch_utils.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c41995e4a913..8e9bba344287 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -371,8 +371,8 @@ def __init__( logger.info_once("Disabling MoE shared_experts cuda stream") self.shared_experts_stream = None else: - # TODO(rob): enable shared expert overlap with non-cuda. 
- # aux_stream() returns None on non-cuda platforms. + # TODO(rob): enable shared expert overlap with non-cuda-alike. + # aux_stream() returns None on non-cuda-alike platforms. self.shared_experts_stream = aux_stream() if self.shared_experts_stream is not None: logger.info_once("Enabled separate cuda stream for MoE shared_experts") @@ -1865,6 +1865,11 @@ def forward_impl( hidden_states_combined, router_logits = get_ep_group().dispatch( hidden_states, router_logits, self.is_sequence_parallel ) + # Run shared experts before matrix multiply. + # because matrix multiply maybe modify the hidden_states. + if has_separate_shared_experts and not use_shared_experts_stream: + assert self.shared_experts is not None + shared_output = self.shared_experts(hidden_states) # Matrix multiply. final_hidden_states = self.quant_method.apply( @@ -1908,8 +1913,6 @@ def forward_impl( # conflict with the main stream shared_output = self.shared_experts(hidden_states_clone) current_stream().wait_stream(self.shared_experts_stream) - else: - shared_output = self.shared_experts(hidden_states) final_hidden_states = ( shared_output, diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index 7c094e14cff7..3661dfd09047 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -426,8 +426,7 @@ def aux_stream() -> torch.cuda.Stream | None: from vllm.platforms import current_platform - # TODO: validate this works properly on ROCm platform. - if _aux_stream is None and current_platform.is_cuda(): + if _aux_stream is None and current_platform.is_cuda_alike(): _aux_stream = torch.cuda.Stream() return _aux_stream From ac10fd3c6900228e3c0a8fae20d039668c132446 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:59:30 -0800 Subject: [PATCH 200/578] Upstreaming aiter triton attention backend as a new backend (#28701) Signed-off-by: Aleksandr Malyshev Co-authored-by: Aleksandr Malyshev --- vllm/attention/backends/registry.py | 3 + vllm/platforms/rocm.py | 4 +- .../backends/mla/aiter_triton_mla.py | 74 +++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 vllm/v1/attention/backends/mla/aiter_triton_mla.py diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 51899b023591..91e1cad01f4f 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -46,6 +46,9 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend" ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" + ROCM_AITER_TRITON_MLA = ( + "vllm.v1.attention.backends.mla.aiter_triton_mla.AiterTritonMLABackend" + ) ROCM_AITER_FA = ( "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" ) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index bb116792fed5..f07f068a9249 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -234,7 +234,6 @@ def get_attn_backend_cls( if rocm_aiter_ops.is_mla_enabled() or block_size == 1 else AttentionBackendEnum.TRITON_MLA ) - if selected_backend == AttentionBackendEnum.TRITON_MLA: if block_size != 1: logger.info_once("Using Triton MLA backend.") @@ -246,6 +245,9 @@ def get_attn_backend_cls( if selected_backend == AttentionBackendEnum.ROCM_AITER_MLA: logger.info("Using AITER MLA backend.") return 
AttentionBackendEnum.ROCM_AITER_MLA.get_path() + if selected_backend == AttentionBackendEnum.ROCM_AITER_TRITON_MLA: + logger.info("Using AITER TRITON MLA backend.") + return AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path() raise ValueError( f" The selected backend, {selected_backend.name}," diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py new file mode 100644 index 000000000000..8a92152a0ca5 --- /dev/null +++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.v1.attention.backends.mla.common import MLACommonBackend +from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( + AiterMLAImpl, + AiterMLAMetadataBuilder, +) + + +class AiterTritonMLABackend(MLACommonBackend): + @staticmethod + def get_name() -> str: + return "AITER_TRITON_MLA" + + @staticmethod + def get_impl_cls() -> type["AiterTritonMLAImpl"]: + return AiterTritonMLAImpl + + @staticmethod + def get_builder_cls() -> type["AiterMLAMetadataBuilder"]: + return AiterMLAMetadataBuilder + + +class AiterTritonMLAImpl(AiterMLAImpl): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + **mla_args, + ) -> None: + super().__init__( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **mla_args, + ) + from aiter.ops.triton.mha import flash_attn_varlen_func + + self.flash_attn_varlen_func = flash_attn_varlen_func + + def _flash_attn_varlen_diff_headdims( + self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs + ): + result = self.flash_attn_varlen_func( + q, + k, + v, + softmax_scale=softmax_scale, + return_lse=return_softmax_lse, + **kwargs, + ) + # Transpose the LSE if Triton MHA is used: + # (q.shape[0], num_q_heads) to (num_q_heads, q.shape[0]) + if type(result) is tuple and return_softmax_lse: + output, lse = result + lse = lse.T.contiguous() + return (output, lse) + return result From 02f5903b84cfdf0b7cb31d46e995e3d4b9ad9e53 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Wed, 19 Nov 2025 12:01:05 -0800 Subject: [PATCH 201/578] Eagle: MM Cuda Graphs with MRope (#28896) Signed-off-by: Izzy Putterman Co-authored-by: Cyrus Leung --- vllm/model_executor/models/llama_eagle3.py | 14 ++++++-------- vllm/v1/spec_decode/eagle.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 75c671311b49..3eaf2d80082f 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -23,7 +23,6 @@ maybe_remap_kv_scale_name, ) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors from .utils import ( @@ -121,13 +120,12 @@ def forward( @support_torch_compile( - # torch.compile is disabled for multimodal EAGLE3 models due to constraint - # violations with dynamic shapes during tensor concatenation operations. 
- # See: https://github.com/vllm-project/vllm/pull/22872/files#r2362028132 - # Non-multimodal EAGLE3 models can still use torch.compile safely. - enable_if=lambda vllm_config: not MULTIMODAL_REGISTRY.supports_multimodal_inputs( - vllm_config.model_config - ), + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "hidden_states": 0, + "input_embeds": 0, + } ) class LlamaModel(nn.Module): def __init__( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5bf2503c3027..406bb696bd4c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -116,9 +116,18 @@ def __init__( ) self.uses_mrope = self.vllm_config.model_config.uses_mrope if self.uses_mrope: - # M-RoPE need (3, max_num_tokens) + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 self.mrope_positions = torch.zeros( - (3, self.max_num_tokens), dtype=torch.int64, device=device + (3, self.max_num_tokens + 1), dtype=torch.int64, device=device ) else: # RoPE need (max_num_tokens,) From 2fd893b4cec0975a2a8430077fd9b4f294eb3561 Mon Sep 17 00:00:00 2001 From: Qiu Date: Thu, 20 Nov 2025 04:52:44 +0800 Subject: [PATCH 202/578] [Feature] Prefill Context Parallel (PCP) basic support (#28718) Signed-off-by: QiuChunshuo Signed-off-by: FENP Signed-off-by: LookAround Signed-off-by: Jingchun Gao Signed-off-by: zhenwenqi2024 Co-authored-by: FENP Co-authored-by: LookAround Co-authored-by: Jingchun Gao Co-authored-by: zhenwenqi2024 Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> --- tests/distributed/test_context_parallel.py | 12 +-- .../moe/modular_kernel_tools/common.py | 7 +- tests/v1/worker/test_gpu_model_runner.py | 4 +- vllm/attention/backends/abstract.py | 17 +++++ vllm/attention/ops/common.py | 40 +++++++++- vllm/config/parallel.py | 40 +++++++--- vllm/config/vllm.py | 32 ++++++-- vllm/distributed/parallel_state.py | 74 +++++++++++++++---- vllm/engine/arg_utils.py | 22 ++++++ .../model_executor/layers/fused_moe/config.py | 59 ++++++++++----- vllm/model_executor/layers/fused_moe/layer.py | 32 ++++++++ vllm/model_executor/models/gpt_oss.py | 9 ++- vllm/v1/attention/backends/flash_attn.py | 6 +- vllm/v1/attention/backends/mla/common.py | 6 +- vllm/v1/attention/backends/utils.py | 18 ++--- vllm/v1/core/kv_cache_coordinator.py | 17 +++++ vllm/v1/core/kv_cache_manager.py | 9 +-- vllm/v1/core/kv_cache_utils.py | 13 +++- vllm/v1/core/sched/scheduler.py | 2 + vllm/v1/core/single_type_kv_cache_manager.py | 19 ++++- vllm/v1/engine/core.py | 1 + vllm/v1/executor/multiproc_executor.py | 23 ++++-- vllm/v1/kv_cache_interface.py | 5 +- vllm/v1/worker/block_table.py | 35 +++++---- vllm/v1/worker/gpu_input_batch.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 3 + 27 files changed, 399 insertions(+), 114 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index b16fd0d06b14..7e4713b8aece 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -31,7 +31,7 @@ class 
ParallelSetup(NamedTuple): tp_size: int pp_size: int dcp_size: int - dcp_kv_cache_interleave_size: int + cp_kv_cache_interleave_size: int eager_mode: bool chunked_prefill: bool @@ -55,7 +55,7 @@ def detailed( tp_base: int = 4, pp_base: int = 1, dcp_base: int = 1, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", load_format: str | None = None, @@ -71,7 +71,7 @@ def detailed( tp_size=tp_base, pp_size=pp_multiplier * pp_base, dcp_size=int(dcp_multiplier * tp_base), - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, eager_mode=eager_mode_val, chunked_prefill=chunked_prefill_val, ) @@ -116,7 +116,7 @@ def _compare_cp_with_tp( tp_size, pp_size, dcp_size, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, eager_mode, chunked_prefill, ) = parallel_setup @@ -197,7 +197,7 @@ def _compare_cp_with_tp( "--decode-context-parallel-size", str(dcp_size), "--dcp-kv-cache-interleave-size", - str(dcp_kv_cache_interleave_size), + str(cp_kv_cache_interleave_size), "--distributed-executor-backend", distributed_backend, ] @@ -227,7 +227,7 @@ def _compare_cp_with_tp( "deepseek-ai/DeepSeek-V2-Lite-Chat": [ CPTestSettings.detailed(), CPTestSettings.detailed(tp_base=2), - CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64), + CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64), ], "bigcode/gpt_bigcode-santacoder": [ CPTestSettings.detailed(), diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 1d925dc1bea8..d95c22fdf0a5 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -15,7 +15,11 @@ ) from tests.kernels.utils import torch_experts from vllm.config import VllmConfig -from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size +from vllm.distributed import ( + get_dp_group, + get_pcp_group, + get_tensor_model_parallel_world_size, +) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -561,6 +565,7 @@ def next_power_of_2(x): # make moe config moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( tp_size_=get_tensor_model_parallel_world_size(), + pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, vllm_parallel_config=vllm_config.parallel_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index b95c8df3469b..824e45897835 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -956,7 +956,7 @@ def test_hybrid_block_table_initialization(): max_num_reqs = 10 max_num_blocks_per_req = 20 max_num_batched_tokens = 512 - dcp_kv_cache_interleave_size = 8 + cp_kv_cache_interleave_size = 8 block_table = BlockTable( block_size=block_size, @@ -966,7 +966,7 @@ def test_hybrid_block_table_initialization(): pin_memory=False, device=torch.device(DEVICE), kernel_block_size=kernel_block_sizes[0], - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) # Verify hybrid block configuration diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 9275d70fd86a..d28bc065852d 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ 
-266,6 +266,12 @@ class AttentionImpl(ABC, Generic[T]): dcp_world_size: int dcp_rank: int + pcp_world_size: int + pcp_rank: int + + total_cp_world_size: int + total_cp_rank: int + def __new__(cls, *args, **kwargs): # use __new__ so that all subclasses will call this self = super().__new__(cls) @@ -278,6 +284,17 @@ def __new__(cls, *args, **kwargs): # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 + try: + from vllm.distributed.parallel_state import get_pcp_group + + self.pcp_world_size = get_pcp_group().world_size + self.pcp_rank = get_pcp_group().rank_in_group + except AssertionError: + self.pcp_world_size = 1 + self.pcp_rank = 0 + self.total_cp_world_size = self.pcp_world_size * self.dcp_world_size + self.total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + self.need_to_return_lse_for_decode = ( self.dcp_world_size > 1 and self.can_return_lse_for_decode ) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 2cbb5c91cc3b..67c5f7dbba9c 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -169,12 +169,11 @@ def correct_attn_out( return out, lse -def cp_lse_ag_out_rs( +def _cp_lse_common( cp_attn_out: torch.Tensor, cp_attn_lse: torch.Tensor, cp_group: GroupCoordinator, - ctx: CPTritonContext = None, - return_lse=False, + ctx: CPTritonContext | None = None, ): """ cp_attn_out: [ B, H, D ] @@ -195,6 +194,22 @@ def cp_lse_ag_out_rs( cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + assert out.is_contiguous() + return out, lse + + +def cp_lse_ag_out_rs( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext | None = None, + return_lse: bool = False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx) out = cp_group.reduce_scatter(out, dim=1) if return_lse: @@ -205,6 +220,25 @@ def cp_lse_ag_out_rs( return out +def cp_lse_ag_out_ar( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext | None = None, + return_lse: bool = False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx) + out = cp_group.all_reduce(out) + + if return_lse: + return out, lse + return out + + @triton.jit def _pack_seq_kernel( x_ptr, # [N, D] diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 0f107a7a3ef8..4b0236d8de3f 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -71,6 +71,8 @@ class ParallelConfig: """Number of pipeline parallel groups.""" tensor_parallel_size: int = 1 """Number of tensor parallel groups.""" + prefill_context_parallel_size: int = 1 + """Number of prefill context parallel groups.""" data_parallel_size: int = 1 """Number of data parallel groups. MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.""" @@ -239,14 +241,25 @@ class is dynamically inherited by the worker class. This is used to inject needs to be divisible by dcp_size.""" dcp_kv_cache_interleave_size: int = 1 - """Interleave size of kv_cache storage while using dcp or cp > 1, - store interleave_size tokens on (d)cp i, - then store next interleave_size tokens on (d)cp i+1. 
-    Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
-    Interleave_size=block_size: block-level align, first fill the block on first rank,
-    token is stored on rank i+1 block j after rank i block j is full.
-    Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
-    Block_size should be divisible by dcp_kv_cache_interleave_size.
+    """
+    Interleave size of kv_cache storage while using DCP.
+    dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
+    and will be deprecated when PCP is fully supported.
+
+    """
+    cp_kv_cache_interleave_size: int = 1
+    """Interleave size of kv_cache storage while using DCP or PCP.
+    For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
+    and `total_cp_world_size = pcp_world_size * dcp_world_size`.
+    Store interleave_size tokens on total_cp_rank i,
+    then store the next interleave_size tokens on total_cp_rank i+1.
+    Interleave_size=1: token-level alignment, where token `i` is stored on
+    total_cp_rank `i % total_cp_world_size`.
+    Interleave_size=block_size: block-level alignment, where tokens are
+    first populated to the preceding ranks. Tokens are then stored
+    in (rank i+1, block j) only after (rank i, block j) is fully occupied.
+    Block_size should be greater than or equal to cp_kv_cache_interleave_size.
+    Block_size should be divisible by cp_kv_cache_interleave_size.
     """

     _api_process_count: int = Field(default=1, gt=0)
@@ -311,6 +324,11 @@ def _validate_parallel_config(self) -> Self:
                 "num_redundant_experts."
             )

+        if self.prefill_context_parallel_size > 1:
+            raise ValueError(
+                "Prefill context parallelism is not fully supported. "
+                "Please set prefill_context_parallel_size to 1."
+            )
         return self

     @property
@@ -529,7 +547,11 @@ def __post_init__(self) -> None:
             )

         # Continue with the rest of the initialization
-        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
+        self.world_size = (
+            self.pipeline_parallel_size
+            * self.tensor_parallel_size
+            * self.prefill_context_parallel_size
+        )

         if self.distributed_executor_backend == "external_launcher":
             logger.info("Using external launcher for distributed inference.")
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 672b004c4aa5..d64e315b4fe3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -481,6 +481,14 @@ def __post_init__(self):
                     "Overriding cudagraph_mode to PIECEWISE."
                 )
                 self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            # prefill context parallel does not support full cudagraphs
+            elif self.parallel_config.prefill_context_parallel_size > 1:
+                logger.warning_once(
+                    "Prefill context parallel (PCP) is enabled, which is "
+                    "incompatible with full CUDA graphs. "
+                    "Overriding cudagraph_mode to PIECEWISE."
+                )
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
             elif self.model_config is not None:
                 if self.model_config.pooler_config is not None:
                     logger.warning_once(
@@ -610,22 +618,34 @@ def __post_init__(self):

         # If DCP, ensure the block size is right.
         if self.parallel_config.decode_context_parallel_size > 1:
+            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+                self.parallel_config.cp_kv_cache_interleave_size
+                != self.parallel_config.dcp_kv_cache_interleave_size
+            ):
+                self.parallel_config.cp_kv_cache_interleave_size = (
+                    self.parallel_config.dcp_kv_cache_interleave_size
+                )
+                logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
And dcp-kv-cache-interleave-size will be " + "deprecated when PCP is fully supported." + ) assert ( - self.parallel_config.dcp_kv_cache_interleave_size + self.parallel_config.cp_kv_cache_interleave_size <= self.cache_config.block_size and self.cache_config.block_size - % self.parallel_config.dcp_kv_cache_interleave_size + % self.parallel_config.cp_kv_cache_interleave_size == 0 ), ( f"Block_size({self.cache_config.block_size}) should be greater " - "than or equal to and divisible by dcp_kv_cache_interleave_size " - f"({self.parallel_config.dcp_kv_cache_interleave_size})." + "than or equal to and divisible by cp_kv_cache_interleave_size " + f"({self.parallel_config.cp_kv_cache_interleave_size})." ) assert ( - self.parallel_config.dcp_kv_cache_interleave_size == 1 + self.parallel_config.cp_kv_cache_interleave_size == 1 or self.speculative_config is None - ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now." + ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." # Do this after all the updates to compilation_config.mode if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 852c4c644433..f81612fd1f4a 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1098,6 +1098,12 @@ def get_dcp_group() -> GroupCoordinator: _PP: GroupCoordinator | None = None + +def get_pp_group() -> GroupCoordinator: + assert _PP is not None, "pipeline model parallel group is not initialized" + return _PP + + _DP: GroupCoordinator | None = None @@ -1114,9 +1120,12 @@ def get_ep_group() -> GroupCoordinator: return _EP -def get_pp_group() -> GroupCoordinator: - assert _PP is not None, "pipeline model parallel group is not initialized" - return _PP +_PCP: GroupCoordinator | None = None + + +def get_pcp_group() -> GroupCoordinator: + assert _PCP is not None, "prefill context parallel group is not initialized" + return _PCP @deprecated( @@ -1276,6 +1285,7 @@ def init_distributed_environment( def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, + prefill_context_model_parallel_size: int = 1, decode_context_model_parallel_size: int | None = 1, backend: str | None = None, ) -> None: @@ -1325,7 +1335,11 @@ def initialize_model_parallel( # to get group_ranks for each dimension, transpose that dimension to the # last dimension, then reshape to 2D, then unbind the last dimension all_ranks = torch.arange(world_size).reshape( - -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size + -1, + data_parallel_size, + pipeline_model_parallel_size, + prefill_context_model_parallel_size, + tensor_model_parallel_size, ) # noqa # Build the tensor model-parallel groups. @@ -1360,11 +1374,23 @@ def initialize_model_parallel( group_name="dcp", ) + global _PCP + assert _PCP is None, "prefill context parallel group is already initialized" + group_ranks = ( + all_ranks.transpose(3, 4) + .reshape(-1, prefill_context_model_parallel_size) + .unbind(0) + ) + group_ranks = [x.tolist() for x in group_ranks] + _PCP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="pcp" + ) + # Build the pipeline model-parallel groups. 
global _PP assert _PP is None, "pipeline model parallel group is already initialized" group_ranks = ( - all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0) + all_ranks.transpose(2, 4).reshape(-1, pipeline_model_parallel_size).unbind(0) ) group_ranks = [x.tolist() for x in group_ranks] _PP = init_model_parallel_group( @@ -1373,7 +1399,7 @@ def initialize_model_parallel( global _DP assert _DP is None, "data parallel group is already initialized" - group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0) + group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0) group_ranks = [x.tolist() for x in group_ranks] _DP = init_model_parallel_group( group_ranks, get_world_group().local_rank, backend, group_name="dp" @@ -1383,7 +1409,12 @@ def initialize_model_parallel( assert _EP is None, "expert parallel group is already initialized" group_ranks = ( all_ranks.transpose(1, 2) - .reshape(-1, data_parallel_size * tensor_model_parallel_size) + .reshape( + -1, + data_parallel_size + * prefill_context_model_parallel_size + * tensor_model_parallel_size, + ) .unbind(0) ) group_ranks = [x.tolist() for x in group_ranks] @@ -1393,11 +1424,13 @@ def initialize_model_parallel( logger.info_once( "rank %s in world size %s is assigned as " - "DP rank %s, PP rank %s, TP rank %s, EP rank %s", + "DP rank %s, PP rank %s, PCP rank %s, " + "TP rank %s, EP rank %s", rank, world_size, _DP.rank_in_group, _PP.rank_in_group, + _PCP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group, ) @@ -1406,6 +1439,7 @@ def initialize_model_parallel( def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, + prefill_context_model_parallel_size: int = 1, decode_context_model_parallel_size: int | None = 1, backend: str | None = None, ) -> None: @@ -1418,6 +1452,7 @@ def ensure_model_parallel_initialized( initialize_model_parallel( tensor_model_parallel_size, pipeline_model_parallel_size, + prefill_context_model_parallel_size, decode_context_model_parallel_size, backend, ) @@ -1434,6 +1469,12 @@ def ensure_model_parallel_initialized( f"got: {pp_world_size=} vs. " f"wanted: {pipeline_model_parallel_size=}" ) + pcp_world_size = get_pcp_group().world_size + assert pcp_world_size == prefill_context_model_parallel_size, ( + "prefill context parallel group already initialized, but of unexpected size: " + f"{pcp_world_size=} vs. 
" + f"{prefill_context_model_parallel_size=}" + ) def prepare_communication_buffer_for_model(model: torch.nn.Module): @@ -1445,6 +1486,8 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module): """ if _TP is not None: _TP.prepare_communication_buffer_for_model(model) + if _PCP is not None: + _PCP.prepare_communication_buffer_for_model(model) if _PP is not None: _PP.prepare_communication_buffer_for_model(model) if _DP is not None: @@ -1520,16 +1563,21 @@ def destroy_model_parallel(): _TP.destroy() _TP = None - global _PP - if _PP: - _PP.destroy() - _PP = None - global _DCP if _DCP: _DCP.destroy() _DCP = None + global _PCP + if _PCP: + _PCP.destroy() + _PCP = None + + global _PP + if _PP: + _PP.destroy() + _PP = None + global _DP if _DP: _DP.destroy() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e2f7326448b3..68205b6079d7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -389,8 +389,10 @@ class EngineArgs: nnodes: int = ParallelConfig.nnodes node_rank: int = ParallelConfig.node_rank tensor_parallel_size: int = ParallelConfig.tensor_parallel_size + prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size + cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size data_parallel_size: int = ParallelConfig.data_parallel_size data_parallel_rank: int | None = None data_parallel_start_rank: int | None = None @@ -770,6 +772,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--dcp-kv-cache-interleave-size", **parallel_kwargs["dcp_kv_cache_interleave_size"], ) + parallel_group.add_argument( + "--cp-kv-cache-interleave-size", + **parallel_kwargs["cp_kv_cache_interleave_size"], + ) + parallel_group.add_argument( + "--prefill-context-parallel-size", + "-pcp", + **parallel_kwargs["prefill_context_parallel_size"], + ) parallel_group.add_argument( "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"] ) @@ -1600,6 +1611,7 @@ def create_engine_config( parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, + prefill_context_parallel_size=self.prefill_context_parallel_size, data_parallel_size=self.data_parallel_size, data_parallel_rank=self.data_parallel_rank or 0, data_parallel_external_lb=data_parallel_external_lb, @@ -1631,6 +1643,7 @@ def create_engine_config( worker_extension_cls=self.worker_extension_cls, decode_context_parallel_size=self.decode_context_parallel_size, dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size, _api_process_count=self._api_process_count, _api_process_rank=self._api_process_rank, ) @@ -1952,6 +1965,15 @@ def _set_default_args( default_prefix_caching, ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + if self.prefill_context_parallel_size > 1: + default_chunked_prefill = False + default_prefix_caching = False + logger.warning( + "--prefill-context-parallel-size > 1 is not compatible with " + "chunked prefill and prefix caching now. Chunked prefill " + "and prefix caching have been disabled by default." 
+ ) + if self.enable_chunked_prefill is None: self.enable_chunked_prefill = default_chunked_prefill diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index a7bd64b1c65e..21eb4d590a7d 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -8,7 +8,11 @@ import vllm.envs as envs from vllm.config import ParallelConfig -from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank +from vllm.distributed import ( + get_dp_group, + get_pcp_group, + get_tensor_model_parallel_rank, +) from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_DTYPES, @@ -684,9 +688,11 @@ def biased_moe_quant_config( @dataclass class FusedMoEParallelConfig: tp_size: int + pcp_size: int dp_size: int ep_size: int tp_rank: int + pcp_rank: int dp_rank: int ep_rank: int @@ -713,19 +719,22 @@ def use_deepep_ll_kernels(self): return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency" @staticmethod - def flatten_tp_across_dp( - tp_size: int, dp_size: int, dp_rank: int + def flatten_tp_across_dp_and_pcp( + tp_size: int, dp_size: int, dp_rank: int, pcp_size: int, pcp_rank: int ) -> tuple[int, int]: tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank() - # There are actually dp_size * tp_size devices. Update tp_size - # and tp_rank so we shard across all devices. - flatten_tp_size = dp_size * tp_size - flatten_tp_rank = dp_rank * tp_size + tp_rank + # There are actually dp_size * pcp_size * tp_size devices. + # Update tp_size and tp_rank so we shard across all devices. + flatten_tp_size = dp_size * pcp_size * tp_size + flatten_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank return flatten_tp_size, flatten_tp_rank @staticmethod def make( - tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig + tp_size_: int, + pcp_size_: int, + dp_size_: int, + vllm_parallel_config: ParallelConfig, ) -> "FusedMoEParallelConfig": """ Determine MoE parallel configuration. Based on the input `tp_size_`, @@ -734,19 +743,22 @@ def make( Args: tp_size_ (int): `tp_size` passed into the FusedMoE constructor. + pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor. dp_size_ (int): `dp_size` passed into the FusedMoE constructor. vllm_parallel_config (ParallelConfig): vLLM's parallel config object which contains the `enable_expert_parallel` flag. Examples: When there is no parallelism requested, - i.e. `tp_size_` = `dp_size_` = 1, we simply return the sizes + i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes unaltered and the ranks set to 0. - Expert Parallelism is considered only when either `dp_size_` or + Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or `tp_size_` is non trivial. - When TP = 2, DP = 1 and EP = False, the configuration on different + Note that PCP serves the same function as DP here. + + When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different devices: - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // @@ -754,7 +766,7 @@ def make( - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0} - Comment : Tensors are sharded across 2 devices. 
- When TP = 1, DP = 2 and EP = False, the configuration on different + When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different devices: - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0} @@ -762,7 +774,7 @@ def make( - Comment: There are 2 engine instances and the tensors are sharded across 2 decvices. - When TP = 2, DP = 2 and EP = False, the configuration on different + When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different devices: - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0} @@ -772,14 +784,14 @@ def make( - Comment: There are 2 engine instances and the tensors are sharded across 4 devices. - When, TP = 2, DP = 1 and EP = True, the configuration on different + When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0} - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1} - Comment: The experts are split between the 2 devices. - When, TP = 1, DP = 2 and EP = True, the configuration on different + When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0} @@ -787,7 +799,7 @@ def make( - Comment: There are 2 engine instances and the experts are split between the 2 devices. - When TP = 2, DP = 2 and EP = True, the configuration on different + When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0} @@ -798,18 +810,25 @@ def make( between the 4 devices. """ - use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel + use_ep = ( + dp_size_ * pcp_size_ * tp_size_ > 1 + and vllm_parallel_config.enable_expert_parallel + ) dp_size = dp_size_ dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0 - tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( - tp_size_, dp_size_, dp_rank + pcp_size = pcp_size_ + pcp_rank = get_pcp_group().rank_in_group if pcp_size > 1 else 0 + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( + tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank ) if not use_ep: return FusedMoEParallelConfig( tp_size=tp_size, tp_rank=tp_rank, + pcp_size=pcp_size, + pcp_rank=pcp_rank, dp_size=dp_size, dp_rank=dp_rank, ep_size=1, @@ -826,6 +845,8 @@ def make( return FusedMoEParallelConfig( tp_size=1, tp_rank=0, + pcp_size=pcp_size, + pcp_rank=pcp_rank, dp_size=dp_size, dp_rank=dp_rank, ep_size=ep_size, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8e9bba344287..7b15e63e9e35 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -18,6 +18,7 @@ from vllm.distributed import ( get_dp_group, get_ep_group, + get_pcp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) @@ -343,6 +344,7 @@ def __init__( tp_size: int | None = None, ep_size: int | None = None, dp_size: int | None = None, + pcp_size: int | None = None, prefix: str = "", custom_routing_function: Callable | None = None, scoring_func: str = "softmax", @@ -398,12 +400,14 @@ def __init__( tp_size if tp_size is not None else get_tensor_model_parallel_world_size() ) dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size + pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size self.is_sequence_parallel = is_sequence_parallel self.sp_size = tp_size_ if is_sequence_parallel else 1 self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( 
tp_size_=tp_size_, + pcp_size_=pcp_size_, dp_size_=dp_size_, vllm_parallel_config=vllm_config.parallel_config, ) @@ -679,6 +683,10 @@ def tp_size(self): def dp_size(self): return self.moe_parallel_config.dp_size + @property + def pcp_size(self): + return self.moe_parallel_config.pcp_size + @property def ep_size(self): return self.moe_parallel_config.ep_size @@ -691,6 +699,10 @@ def tp_rank(self): def dp_rank(self): return self.moe_parallel_config.dp_rank + @property + def pcp_rank(self): + return self.moe_parallel_config.pcp_rank + @property def ep_rank(self): return self.moe_parallel_config.ep_rank @@ -1871,6 +1883,19 @@ def forward_impl( assert self.shared_experts is not None shared_output = self.shared_experts(hidden_states) + # NOTE: Similar with DP, PCP also needs dispatch and combine. For + # simplicity, AgRsAll2All was added separately for PCP here. Maybe + # we should modify All2AllManager abstract to better support PCP. + if self.pcp_size > 1: + hidden_states = get_pcp_group().all_gather( + hidden_states, + dim=0, + ) + router_logits = get_pcp_group().all_gather( + router_logits, + dim=0, + ) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1925,6 +1950,13 @@ def forward_impl( def combine_output(states: torch.Tensor) -> torch.Tensor: if do_naive_dispatch_combine: states = get_ep_group().combine(states, self.is_sequence_parallel) + + if self.pcp_size > 1: + states = get_pcp_group().reduce_scatter( + states, + dim=0, + ) + return states if self.shared_experts is not None: diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index f310f71af92d..25048330f797 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -13,6 +13,7 @@ from vllm.distributed import ( get_dp_group, get_ep_group, + get_pcp_group, get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -322,10 +323,12 @@ def _load_weights_mxfp4( # In MoE, we need to flatten the tensor parallel size across the data # parallel size when EP is disabled. - tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( tp_size=get_tensor_model_parallel_world_size(), dp_size=get_dp_group().world_size, dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, ) intermediate_size = self.config.intermediate_size @@ -507,10 +510,12 @@ def _load_weights_other( # In MoE, we need to flatten the tensor parallel size across the data # parallel size when EP is disabled. 
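# Minimal sketch (plain torch, no process groups; shapes are assumptions) of
# the PCP dispatch/combine added to forward_impl/combine_output above: tokens
# are all-gathered along dim 0 before the experts run, and the partial outputs
# are reduce-scattered afterwards, leaving each PCP rank with the summed
# result for only its own token shard.
import torch

pcp_size, tokens_per_rank, hidden = 2, 3, 4
shards = [torch.randn(tokens_per_rank, hidden) for _ in range(pcp_size)]

# "all_gather(dim=0)": every rank sees the full token set.
gathered = torch.cat(shards, dim=0)

# Each rank contributes a partial output (e.g. from its shard of experts).
partials = [gathered * 0.5, gathered * 0.5]

# "reduce_scatter(dim=0)": sum the partials, hand each rank its own chunk.
reduced = torch.stack(partials).sum(dim=0)
per_rank_out = list(reduced.chunk(pcp_size, dim=0))
assert torch.allclose(torch.cat(per_rank_out, dim=0), gathered)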
- tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( tp_size=get_tensor_model_parallel_world_size(), dp_size=get_dp_group().world_size, dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, ) intermediate_size = self.config.intermediate_size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fdc99a0df1c8..cf3c1d05f5b3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -265,8 +265,8 @@ def __init__( self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_kv_cache_interleave_size = ( - self.parallel_config.dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size = ( + self.parallel_config.cp_kv_cache_interleave_size ) self.use_full_cuda_graph = ( @@ -388,7 +388,7 @@ def schedule( dcp_context_kv_lens_cpu, self.dcp_world_size, self.dcp_rank, - self.dcp_kv_cache_interleave_size, + self.cp_kv_cache_interleave_size, ) dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device) max_dcp_context_kv_len = dcp_context_kv_lens.max().item() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index e328049b53c7..32f406980f2e 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -536,7 +536,7 @@ def __init__( # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_local_block_size = parallel_config.dcp_kv_cache_interleave_size + self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size # Don't try to access the runner on AMD @@ -1289,8 +1289,8 @@ def __init__(self, *args, **kwargs) -> None: get_current_vllm_config() ) ) - self.dcp_kv_cache_interleave_size: int = ( - get_current_vllm_config().parallel_config.dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size: int = ( + get_current_vllm_config().parallel_config.cp_kv_cache_interleave_size ) def _flash_attn_varlen_diff_headdims( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 0dd189633129..540a8e2b1d01 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1080,9 +1080,9 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor): def get_dcp_local_seq_lens( seq_lens: torch.Tensor, - dcp_world_size: int = 1, + dcp_size: int = 1, dcp_rank: int | None = None, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ) -> torch.Tensor: """While using dcp, kv_cache size stored on each rank may be different, use this function to calculate split decode seq_lens of each dcp rank. 
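# Worked example (plain Python, values assumed) of the interleaved split that
# get_dcp_local_seq_lens computes with the base/remainder arithmetic in the
# hunk below. With cp_kv_cache_interleave_size=4 and dcp_size=2, tokens are
# dealt out in chunks of 4, chunk j going to rank j % 2, so a 37-token
# sequence keeps 20 tokens on rank 0 and 17 on rank 1.
seq_len, interleave, dcp_size = 37, 4, 2

def local_kv_len(rank: int) -> int:
    base = seq_len // interleave // dcp_size * interleave
    remainder = seq_len - base * dcp_size
    remainder = min(max(remainder - rank * interleave, 0), interleave)
    return base + remainder

assert [local_kv_len(r) for r in range(dcp_size)] == [20, 17]
assert sum(local_kv_len(r) for r in range(dcp_size)) == seq_len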
@@ -1091,7 +1091,7 @@ def get_dcp_local_seq_lens( num_requests = seq_lens.size(0) if dcp_rank is None: rank_offsets = ( - torch.arange(dcp_world_size, dtype=torch.int32) + torch.arange(dcp_size, dtype=torch.int32) .unsqueeze(0) .repeat(num_requests, 1) ) @@ -1102,15 +1102,15 @@ def get_dcp_local_seq_lens( ) base = ( seq_lens_tiled - // dcp_kv_cache_interleave_size - // dcp_world_size - * dcp_kv_cache_interleave_size + // cp_kv_cache_interleave_size + // dcp_size + * cp_kv_cache_interleave_size ) - remainder = seq_lens_tiled - base * dcp_world_size + remainder = seq_lens_tiled - base * dcp_size remainder = torch.clip( - remainder - rank_offsets * dcp_kv_cache_interleave_size, + remainder - rank_offsets * cp_kv_cache_interleave_size, 0, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, ) dcp_local_seq_lens = base + remainder return dcp_local_seq_lens.squeeze(1) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 137e5e0cdb6d..1531b61f88fe 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -27,6 +27,7 @@ def __init__( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): self.kv_cache_config = kv_cache_config self.max_model_len = max_model_len @@ -44,6 +45,7 @@ def __init__( block_pool=self.block_pool, kv_cache_group_id=i, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) for i, kv_cache_group in enumerate(self.kv_cache_config.kv_cache_groups) ) @@ -210,6 +212,7 @@ def __init__( use_eagle: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -218,6 +221,7 @@ def __init__( False, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.num_single_type_manager = len(self.single_type_managers) @@ -250,6 +254,7 @@ def __init__( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -258,12 +263,16 @@ def __init__( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size self.dcp_world_size = dcp_world_size + self.pcp_world_size = pcp_world_size if dcp_world_size > 1: self.block_size *= dcp_world_size + if pcp_world_size > 1: + self.block_size *= pcp_world_size assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "UnitaryKVCacheCoordinator assumes only one kv cache group" ) @@ -281,6 +290,7 @@ def find_longest_cache_hit( kv_cache_spec=self.kv_cache_spec, use_eagle=self.use_eagle, dcp_world_size=self.dcp_world_size, + pcp_world_size=self.pcp_world_size, ) return hit_blocks, len(hit_blocks[0]) * self.block_size @@ -302,6 +312,7 @@ def __init__( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -310,8 +321,10 @@ def __init__( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) assert dcp_world_size == 1, "DCP not support hybrid attn now." + assert pcp_world_size == 1, "PCP not support hybrid attn now." 
self.verify_and_split_kv_cache_groups() def verify_and_split_kv_cache_groups(self) -> None: @@ -452,6 +465,7 @@ def get_kv_cache_coordinator( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ) -> KVCacheCoordinator: if not enable_caching: return KVCacheCoordinatorNoPrefixCache( @@ -460,6 +474,7 @@ def get_kv_cache_coordinator( use_eagle, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) if len(kv_cache_config.kv_cache_groups) == 1: return UnitaryKVCacheCoordinator( @@ -469,6 +484,7 @@ def get_kv_cache_coordinator( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) return HybridKVCacheCoordinator( kv_cache_config, @@ -477,4 +493,5 @@ def get_kv_cache_coordinator( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7f405fc248ac..2012c3fef88b 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -100,6 +100,7 @@ def __init__( log_stats: bool = False, enable_kv_cache_events: bool = False, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> None: self.max_model_len = max_model_len @@ -124,12 +125,9 @@ def __init__( 0 ].kv_cache_spec.block_size - if dcp_world_size > 1: + if dcp_world_size * pcp_world_size > 1: assert len(kv_cache_config.kv_cache_groups) == 1 - # Note(hc): need revisit. When both DCP and any future - # PCP are enabled, the block_size may need to be scaled - # by a factor of dcp_size × pcp_size? - self.block_size *= dcp_world_size + self.block_size *= dcp_world_size * pcp_world_size self.coordinator = get_kv_cache_coordinator( kv_cache_config=kv_cache_config, @@ -138,6 +136,7 @@ def __init__( enable_caching=self.enable_caching, enable_kv_cache_events=enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6e026215d402..01ecd881115d 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1219,11 +1219,16 @@ def _report_kv_cache_config( // len(kv_cache_config.kv_cache_groups) * min_block_size ) - if vllm_config.parallel_config.decode_context_parallel_size > 1: - num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + dcp_size = vllm_config.parallel_config.decode_context_parallel_size + pcp_size = vllm_config.parallel_config.prefill_context_parallel_size + if pcp_size * dcp_size > 1: + num_tokens *= pcp_size * dcp_size logger.info( - "Multiplying the GPU KV cache size by the dcp_world_size %d.", - vllm_config.parallel_config.decode_context_parallel_size, + "Multiplying the GPU KV cache size by the cp_world_size %d " + "(pcp_world_size %d * dcp_world_size %d).", + pcp_size * dcp_size, + pcp_size, + dcp_size, ) num_tokens_str = f"{num_tokens:,}" logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local") diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4323141c435b..4cc4c29591cc 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -121,6 +121,7 @@ def __init__( self.block_size = block_size self.dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size + self.pcp_world_size = 
vllm_config.parallel_config.prefill_context_parallel_size # req_id -> Request self.requests: dict[str, Request] = {} @@ -183,6 +184,7 @@ def __init__( log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, dcp_world_size=self.dcp_world_size, + pcp_world_size=self.pcp_world_size, ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 14ac83028ee4..d90ec550f766 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -32,6 +32,7 @@ def __init__( block_pool: BlockPool, kv_cache_group_id: int, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> None: """ Initializes the SingleTypeKVCacheManager. @@ -42,8 +43,9 @@ def __init__( """ self.block_size = kv_cache_spec.block_size self.dcp_world_size = dcp_world_size - if self.dcp_world_size > 1: - self.block_size *= dcp_world_size + self.pcp_world_size = pcp_world_size + if dcp_world_size * pcp_world_size > 1: + self.block_size *= dcp_world_size * pcp_world_size self.kv_cache_spec = kv_cache_spec self.block_pool = block_pool @@ -212,6 +214,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ Get the longest cache hit prefix of the blocks that is not longer than @@ -303,6 +306,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance( kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec) @@ -314,8 +318,8 @@ def find_longest_cache_hit( [] for _ in range(len(kv_cache_group_ids)) ) block_size = kv_cache_spec.block_size - if dcp_world_size > 1: - block_size *= dcp_world_size + if dcp_world_size * pcp_world_size > 1: + block_size *= dcp_world_size * pcp_world_size max_num_blocks = max_length // block_size for block_hash in itertools.islice(block_hashes, max_num_blocks): # block_hashes is a chain of block hashes. If a block hash is not @@ -362,11 +366,13 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, SlidingWindowSpec), ( "SlidingWindowManager can only be used for sliding window groups" ) assert dcp_world_size == 1, "DCP not support sliding window attn now." + assert pcp_world_size == 1, "PCP not support sliding window attn now." # The number of contiguous blocks needed for prefix cache hit. # -1 since the input token itself is also included in the window @@ -476,6 +482,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ For chunked local attention, we need to find the longest cache hit @@ -516,6 +523,7 @@ def find_longest_cache_hit( "Hybrid KV cache is not supported for " + "eagle + chunked local attention." ) assert dcp_world_size == 1, "DCP not support chunked local attn now." + assert pcp_world_size == 1, "PCP not support chunked local attn now." 
max_num_blocks = max_length // kv_cache_spec.block_size if max_length > 0: local_attention_start_idx = ( @@ -611,11 +619,13 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, MambaSpec), ( "MambaManager can only be used for mamba groups" ) assert dcp_world_size == 1, "DCP not support mamba now." + assert pcp_world_size == 1, "PCP not support mamba now." computed_blocks: tuple[list[KVCacheBlock], ...] = tuple( [] for _ in range(len(kv_cache_group_ids)) ) @@ -705,6 +715,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, CrossAttentionSpec), ( "CrossAttentionManager can only be used for cross-attention groups" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3a25827cec38..6be19894d332 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,6 +128,7 @@ def __init__( scheduler_block_size = ( vllm_config.cache_config.block_size * vllm_config.parallel_config.decode_context_parallel_size + * vllm_config.parallel_config.prefill_context_parallel_size ) self.scheduler: SchedulerInterface = Scheduler( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ad2ece50f981..7e8ebe25c460 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -35,6 +35,7 @@ get_dp_group, get_ep_group, get_inner_dp_world_group, + get_pcp_group, get_pp_group, get_tp_group, ) @@ -110,12 +111,14 @@ def _init_executor(self) -> None: f"({self.parallel_config.nnodes_within_dp}). " ) self.local_world_size = self.parallel_config.local_world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - pp_parallel_size = self.parallel_config.pipeline_parallel_size - assert self.world_size == tensor_parallel_size * pp_parallel_size, ( + tp_size = self.parallel_config.tensor_parallel_size + pp_size = self.parallel_config.pipeline_parallel_size + pcp_size = self.parallel_config.prefill_context_parallel_size + assert self.world_size == tp_size * pp_size * pcp_size, ( f"world_size ({self.world_size}) must be equal to the " - f"tensor_parallel_size ({tensor_parallel_size}) x pipeline" - f"_parallel_size ({pp_parallel_size}). " + f"tensor_parallel_size ({tp_size}) x pipeline" + f"_parallel_size ({pp_size}) x prefill_context" + f"_parallel_size ({pcp_size}). " ) # Set multiprocessing envs @@ -424,7 +427,11 @@ def _get_output_rank(self) -> int: # 16-23, PP rank 2 # 24-31, PP rank 3 # so world_size - tp_size = 32 - 8 = 24 should be PP rank = -1 (i.e. 
3) - return self.world_size - self.parallel_config.tensor_parallel_size + return ( + self.world_size + - self.parallel_config.tensor_parallel_size + * self.parallel_config.prefill_context_parallel_size + ) @dataclass @@ -828,6 +835,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: dp_rank = get_dp_group().rank_in_group pp_size = get_pp_group().world_size pp_rank = get_pp_group().rank_in_group + pcp_size = get_pcp_group().world_size + pcp_rank = get_pcp_group().rank_in_group tp_size = get_tp_group().world_size tp_rank = get_tp_group().rank_in_group dcp_size = get_dcp_group().world_size @@ -837,6 +846,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: process_name += f"_DP{dp_rank}" if pp_size > 1: process_name += f"_PP{pp_rank}" + if pcp_size > 1: + process_name += f"_PCP{pcp_rank}" if tp_size > 1: process_name += f"_TP{tp_rank}" if dcp_size > 1: diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 7f33eb7e699c..751862aa9c76 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -95,10 +95,11 @@ class FullAttentionSpec(AttentionSpec): def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size + pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size # Note(hc): each dcp rank only need save # (max_model_len//dcp_world_size) tokens locally. - if dcp_world_size > 1: - max_model_len = cdiv(max_model_len, dcp_world_size) + if dcp_world_size * pcp_world_size > 1: + max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size) return cdiv(max_model_len, self.block_size) * self.page_size_bytes @classmethod diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 9f6c19e46430..76e17f3797a1 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,7 +4,7 @@ import numpy as np import torch -from vllm.distributed import get_dcp_group +from vllm.distributed import get_dcp_group, get_pcp_group from vllm.logger import init_logger from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer @@ -22,7 +22,7 @@ def __init__( pin_memory: bool, device: torch.device, kernel_block_size: int, - dcp_kv_cache_interleave_size: int, + cp_kv_cache_interleave_size: int, ): """ Args: @@ -80,6 +80,13 @@ def __init__( else: self._kernel_block_arange = None + try: + self.pcp_world_size = get_pcp_group().world_size + self.pcp_rank = get_pcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.pcp_world_size = 1 + self.pcp_rank = 0 try: self.dcp_world_size = get_dcp_group().world_size self.dcp_rank = get_dcp_group().rank_in_group @@ -87,7 +94,7 @@ def __init__( # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_kv_cache_interleave_size = dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size = cp_kv_cache_interleave_size def append_row( self, @@ -131,14 +138,16 @@ def compute_slot_mapping( # NOTE(woosuk): We can't simply use `token_indices // block_size` # here because M (max_model_len) is not necessarily divisible by # block_size. 
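# Numeric preview (toy values, not from the patch) of the interleaved slot
# mapping computed in the hunk below: with a local block_size of 4,
# total_cp_world_size of 2 and cp_kv_cache_interleave_size of 1, token i is
# owned by rank i % 2, matching the Note(hc) comment, and lands at local
# offset i // 2 inside its block.
block_size, world, interleave = 4, 2, 1
virtual_block_size = block_size * world  # 8 token slots per "virtual block"

for pos in range(virtual_block_size):
    off = pos % virtual_block_size
    owner = off // interleave % world
    local_off = off // (world * interleave) * interleave + off % interleave
    assert owner == pos % 2
    assert local_off == pos // 2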
- if self.dcp_world_size > 1: + total_cp_world_size = self.pcp_world_size * self.dcp_world_size + total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + if total_cp_world_size > 1: # Note(hc): The DCP implement store kvcache with an interleave # style, the kvcache for the token whose token_idx is i is # always stored on the GPU whose dcp_rank equals i % cp_world_size: # Use a "virtual block" which equals to world_size * block_size # for block_table_indices calculation. - virtual_block_size = self.block_size * self.dcp_world_size + virtual_block_size = self.block_size * total_cp_world_size block_table_indices = ( req_indices * self.max_num_blocks_per_req + positions // virtual_block_size @@ -150,16 +159,16 @@ def compute_slot_mapping( virtual_block_offsets = positions % virtual_block_size mask = ( virtual_block_offsets - // self.dcp_kv_cache_interleave_size - % self.dcp_world_size - == self.dcp_rank + // self.cp_kv_cache_interleave_size + % total_cp_world_size + == total_cp_rank ) # Calculate local block_offsets block_offsets = ( virtual_block_offsets - // (self.dcp_world_size * self.dcp_kv_cache_interleave_size) - * self.dcp_kv_cache_interleave_size - + virtual_block_offsets % self.dcp_kv_cache_interleave_size + // (total_cp_world_size * self.cp_kv_cache_interleave_size) + * self.cp_kv_cache_interleave_size + + virtual_block_offsets % self.cp_kv_cache_interleave_size ) # Calculate slot_mapping slot_mapping = block_numbers * self.block_size + block_offsets @@ -253,7 +262,7 @@ def __init__( block_sizes: list[int], kernel_block_sizes: list[int], num_speculative_tokens: int = 0, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ) -> None: # Note(hc): each dcp rank only store # (max_model_len//dcp_world_size) tokens in kvcache, @@ -283,7 +292,7 @@ def __init__( pin_memory, device, kernel_block_size, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, ) for block_size, kernel_block_size in zip(block_sizes, kernel_block_sizes) ] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index c1bfe727d86e..7b4bc1d2a224 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -87,7 +87,7 @@ def __init__( is_spec_decode: bool = False, is_pooling_model: bool = False, num_speculative_tokens: int = 0, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ): self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode @@ -141,7 +141,7 @@ def __init__( block_sizes=block_sizes, kernel_block_sizes=kernel_block_sizes, num_speculative_tokens=num_speculative_tokens, - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0c35f1330e9f..80f8344d4410 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -426,7 +426,7 @@ def __init__( # uses output token ids so we set this conservatively. 
logitsprocs_need_output_token_ids=bool(custom_logitsprocs), is_pooling_model=self.is_pooling_model, - dcp_kv_cache_interleave_size=self.parallel_config.dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size, ) self.use_async_scheduling = self.scheduler_config.async_scheduling @@ -1436,7 +1436,7 @@ def _build_attention_metadata( self.seq_lens.cpu[:num_reqs], self.dcp_world_size, self.dcp_rank, - self.parallel_config.dcp_kv_cache_interleave_size, + self.parallel_config.cp_kv_cache_interleave_size, ) self.dcp_local_seq_lens.copy_to_gpu(num_reqs) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 315f01b68499..b8339fc4dc8b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -26,6 +26,7 @@ has_kv_transfer_group, ) from vllm.distributed.parallel_state import ( + get_pcp_group, get_pp_group, get_tp_group, ) @@ -733,6 +734,7 @@ def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int): module.global_num_experts = module.moe_config.num_experts module.moe_parallel_config = FusedMoEParallelConfig.make( tp_size_=get_tp_group().world_size, + pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, vllm_parallel_config=parallel_config, ) @@ -886,6 +888,7 @@ def init_worker_distributed_environment( ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, + parallel_config.prefill_context_parallel_size, parallel_config.decode_context_parallel_size, ) From 68d7231991cc307d6865eac5bfca551c06f67465 Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Wed, 19 Nov 2025 15:04:36 -0600 Subject: [PATCH 203/578] [CI/Build] Fix test_prefix_prefill for AMD (#28905) Signed-off-by: Ryan Rock --- tests/kernels/attention/test_prefix_prefill.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 78cdbbbf7379..e041e8c8d2ff 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -174,11 +174,11 @@ def test_contexted_kv_attention( block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.int32) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0 + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to( + torch.int32 ) for i in range(BS): for j in range(query_lens[i]): @@ -417,11 +417,11 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.int32) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0 + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to( + torch.int32 ) for i in 
range(BS): for j in range(query_lens[i]): From 1607e664f0de4b7eb113c0259b889edbe73c4341 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:18:32 -0500 Subject: [PATCH 204/578] [Bug] Fix Batch Invariant MLA test (#28967) Signed-off-by: yewentao256 --- tests/v1/determinism/test_batch_invariance.py | 41 +++++++++++++++---- vllm/model_executor/layers/batch_invariant.py | 2 +- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index f018ee551dbf..d4e88891512c 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -9,13 +9,33 @@ from utils import _extract_step_logprobs, _random_prompt, skip_unsupported from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + +BACKENDS: list[str] = [ + "FLASH_ATTN", + "FLASHINFER", +] + +if current_platform.is_cuda() and current_platform.is_device_capability(90): + BACKENDS.append("FLASH_ATTN_MLA") + +DEFAULT_MODEL = "Qwen/Qwen3-1.7B" +MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" + + +def resolve_model_name(backend: str) -> str: + """Resolve the model name for the given backend, respecting env overrides.""" + model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) + if backend.endswith("MLA") and model == DEFAULT_MODEL: + return MLA_MODEL + return model @skip_unsupported @pytest.mark.timeout(1000) @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( backend, monkeypatch: pytest.MonkeyPatch @@ -47,7 +67,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # Allow overrides from environment (useful for CI tuning) # "facebook/opt-125m" is too small, doesn't reliably test determinism - model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model = resolve_model_name(backend) num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5")) max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128")) min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024")) @@ -150,7 +170,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) @pytest.mark.forked def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( @@ -160,7 +180,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) # For batch invariance, disable custom all-reduce to ensure deterministic @@ -369,7 +389,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): """ @@ -377,7 +397,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): Useful for quick smoke testing and debugging. 
""" monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model = resolve_model_name(backend) llm = LLM( model=model, @@ -419,7 +439,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) @pytest.mark.forked def test_logprobs_without_batch_invariance_should_fail( @@ -434,6 +454,9 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). """ + from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + + vllm_is_batch_invariant.cache_clear() monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test @@ -441,7 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) print(f"\n{'=' * 80}") @@ -659,7 +682,7 @@ def test_decode_logprobs_match_prefill_logprobs( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) from vllm.model_executor.layers.batch_invariant import ( diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 7920d117de5e..5dbeb2917434 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -803,11 +803,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", - "FLASHINFER_MLA", "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance + # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 ] if curr_attn_backend not in supported_backends: warning = ( From cdeec2e6067613c501f82463d54e420097f49750 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Wed, 19 Nov 2025 22:20:58 +0100 Subject: [PATCH 205/578] [BugFix] Ray with multiple nodes (#28873) Signed-off-by: Julien Denize --- vllm/v1/worker/gpu_worker.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index b8339fc4dc8b..7f9cdd221224 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -205,14 +205,14 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 - ) - assert self.parallel_config.local_world_size <= visible_device_count, ( - f"local_world_size ({self.parallel_config.local_world_size}) must be " - f"less than or equal to the number of visible devices " - f"({visible_device_count})." 
- ) + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must " + f"be less than or equal to the number of visible devices " + f"({visible_device_count})." + ) self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) From 613abb50d5715ba693ee9d5b727e8385b98e7185 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Wed, 19 Nov 2025 15:29:06 -0600 Subject: [PATCH 206/578] [MoE] Nvfp4 Masked Gemm: Add flashinfer grouped_gemm_nt_masked (#25990) Signed-off-by: Shu Wang. Signed-off-by: mgoin Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 1 + tests/kernels/moe/test_cutedsl_moe.py | 582 ++++++++++++++++++ vllm/envs.py | 8 +- .../fused_moe/deepep_ll_prepare_finalize.py | 16 +- .../fused_moe/flashinfer_cutedsl_moe.py | 346 +++++++++++ .../layers/quantization/modelopt.py | 30 +- .../quantization/utils/flashinfer_fp4_moe.py | 43 +- .../quantization/utils/flashinfer_utils.py | 21 +- .../quantization/utils/nvfp4_moe_support.py | 6 +- vllm/utils/flashinfer.py | 42 ++ 10 files changed, 1062 insertions(+), 33 deletions(-) create mode 100644 tests/kernels/moe/test_cutedsl_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 98daebcc0693..5309581d8e81 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -921,6 +921,7 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 diff --git a/tests/kernels/moe/test_cutedsl_moe.py b/tests/kernels/moe/test_cutedsl_moe.py new file mode 100644 index 000000000000..af1a34d17d48 --- /dev/null +++ b/tests/kernels/moe/test_cutedsl_moe.py @@ -0,0 +1,582 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest + +from vllm.platforms import current_platform + +if not current_platform.has_device_capability(100): + pytest.skip( + reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True, + ) + +import torch +from flashinfer import fp4_quantize +from torch.nn import functional as F + +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( + flashinfer_cutedsl_moe_masked, +) +from vllm.utils.flashinfer import ( + flashinfer_cutedsl_grouped_gemm_nt_masked as cutedsl_gmm_masked, +) +from vllm.utils.flashinfer import ( + scaled_fp4_grouped_quantize, +) + +kE2M1ToFloat = torch.tensor( + [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32 +) + +FLOAT8_E4M3_MAX = 448.0 +FLOAT4_E2M1_MAX = 6.0 + + +def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): + m_tiles = (m + 128 - 1) // 128 + f = block_size * 4 + k_tiles = (k + f - 1) // f + tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4)) + tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5)) + out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size) + return out[0:m, 0:k] + + +def dequantize_nvfp4_to_dtype( + tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16 +): + """Dequantize the fp4 tensor back 
to high precision.""" + # Two fp4 values are packed into one uint8. + assert tensor_fp4.dtype == torch.uint8 + m, packed_k = tensor_fp4.shape + k = packed_k * 2 + tensor_f32 = break_fp4_bytes(tensor_fp4, dtype) + tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size) + tensor_sf = tensor_sf.view(torch.float8_e4m3fn) + tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size) + tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale + + # scale the tensor + out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k) + return out.to(dtype=dtype) + + +def break_fp4_bytes(a, dtype): + assert a.dtype == torch.uint8 + m, n = a.shape + + # Vectorized nibble processing + a_flat = a.flatten() + high = (a_flat & 0xF0) >> 4 # Upper nibbles + low = a_flat & 0x0F # Lower nibbles + + # Combine nibbles for batch processing + combined = torch.stack((low, high), dim=1).flatten() + + # Vectorized sign and magnitude extraction + signs = (combined & 0x08).to(torch.bool) # Sign bits + abs_vals = (combined & 0x07).to(torch.long) # Magnitude indices + + # Device-aware lookup and sign application + kE2M1 = kE2M1ToFloat.to(device=a.device) + values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0) + + # Reshape to final form + return values.reshape(m, n * 2).to(dtype=dtype) + + +def generate_balanced_routing( + hidden_states: torch.Tensor, num_experts: int, top_k: int +): + """ + Generate routing weights and topk indices such that every expert is active. + Returns routing_weights, topk_idx + """ + + num_tokens, hidden_dim = hidden_states.shape + # num_tokens = batch_size * seq_len + + # First, assign at least one token per expert + tokens_per_expert = torch.arange(num_tokens) % num_experts + tokens_per_expert = tokens_per_expert[torch.randperm(num_tokens)] # shuffle + + # Each token has top_k experts — start with one guaranteed expert + topk_idx = torch.full((num_tokens, top_k), -1, dtype=torch.long) + topk_idx[:, 0] = tokens_per_expert + + # For remaining top_k - 1 experts, pick randomly (allowing repeats) + if top_k > 1: + random_choices = torch.randint(0, num_experts, (num_tokens, top_k - 1)) + topk_idx[:, 1:] = random_choices + + # Normalize routing weights so each token's weights sum to 1 + routing_weights = torch.rand(num_tokens, top_k) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + # Reshape back if needed + routing_weights = routing_weights.view(num_tokens, top_k) + topk_idx = topk_idx.view(num_tokens, top_k) + + return routing_weights, topk_idx + + +def prepare_inputs( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + num_experts: int, + topk: int, +): + routing_weights, topk_idx = generate_balanced_routing( + router_logits, num_experts, topk + ) + + masked_m = [] + for i in range(num_experts): + mask = topk_idx.view(-1) == i + masked_m.append(mask.sum()) + + masked_m = torch.tensor(masked_m, dtype=torch.int32) + # Intialize the hidden_states_3d with ones instead of empty to avoid nan + # issue. 
+ hidden_states_3d = torch.ones( + (num_experts, max(masked_m), hidden_states.shape[1]), dtype=hidden_states.dtype + ) + for i in range(num_experts): + hidden_states_3d[i, : masked_m[i], :] = hidden_states[topk_idx.view(-1) == i] + + return hidden_states_3d, masked_m, topk_idx, routing_weights + + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 2048, 1024), + (224, 1024, 1024), + (224, 1024, 1536), +] + + +# Reference implementation of torch_moe +def torch_moe(a, w1, w2, score, topk, expert_map): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + if expert_map is not None: + topk_ids = expert_map[topk_ids] + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose( + 0, 1 + ) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) + + +def torch_moe_nvfp4(a, w1, w2, topk, topk_weight, topk_ids): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + m = w1[i].shape[0] + assert m % 2 == 0 + # Note: w1 and w3 are swapped! + w3_expert, w1_expert = w1[i][m // 2 :, :], w1[i][: m // 2, :] + inter = F.silu(a[mask] @ w1_expert.t()) * (a[mask] @ w3_expert.t()) + inter_gs = torch.tensor(1.0).cuda() + inter_q, inter_blockscale = fp4_quantize(inter, inter_gs) + inter = dequantize_nvfp4_to_dtype( + inter_q, + inter_blockscale, + inter_gs, + dtype=inter.dtype, + device=inter.device, + block_size=16, + ).cuda() + out[mask] = inter @ w2[i].transpose(0, 1) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) + + +def grouped_gemm_ref( + hidden_states_expanded: torch.Tensor, + hidden_states_3d: torch.Tensor, + weights: torch.Tensor, + topk_idx: torch.Tensor, + masked_m: torch.Tensor, + B: int, + topk: int, + num_experts: int, + *, + block_size: int = 16, +) -> torch.Tensor: + """ + Computes the reference grouped GEMM (fp4 quantized per-expert loop), + computes flashinfer grouped GEMM (for scale consistency), + and returns ONLY the repacked reference output: out_ref. 
+ + Returns: + out_ref: Tensor [num_experts, max_m, n_out] + """ + device_hs = hidden_states_expanded.device + device_w = weights.device + out_dtype = weights.dtype + n_out = weights.shape[1] + + # Flattened reference output (B*topk, n_out) + out = torch.zeros((B * topk, n_out), dtype=out_dtype, device=device_w) + + # Per-expert reference compute loop + for i in range(num_experts): + mask = topk_idx.view(-1) == i + if mask.any(): + lhs = hidden_states_expanded[mask] + rhs = weights[i] + + a_amax = lhs.abs().max().to(torch.float32).to(device_hs) + b_amax = rhs.abs().max().to(torch.float32).to(device_w) + + a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + + lhsq, lhsq_sf = fp4_quantize(lhs, a_gs) + rhsq, rhsq_sf = fp4_quantize(rhs, b_gs) + + lhs_in_dtype = dequantize_nvfp4_to_dtype( + lhsq, + lhsq_sf, + a_gs, + dtype=lhs.dtype, + device=device_hs, + block_size=block_size, + ) + rhs_in_dtype = dequantize_nvfp4_to_dtype( + rhsq, + rhsq_sf, + b_gs, + dtype=rhs.dtype, + device=device_w, + block_size=block_size, + ) + + out[mask] = lhs_in_dtype @ rhs_in_dtype.t() + + # Determine per-expert max_m + max_m_val = int(masked_m.max().item()) + + # Repack into [num_experts, max_m, n_out] + out_ref = torch.zeros( + (num_experts, max_m_val, n_out), + dtype=out.dtype, + device=out.device, + ) + expert_slot = [0] * num_experts + + for i, expert_id in enumerate(topk_idx.view(-1).tolist()): + slot = expert_slot[expert_id] + if slot < max_m_val: + out_ref[expert_id, slot, :] = out[i] + expert_slot[expert_id] += 1 + else: + raise IndexError( + f"Expert {expert_id} exceeded max slots ({max_m_val}). " + "Increase max_m or check masked_m." + ) + + return out_ref + + +def flashinfer_cutedsl_grouped_gemm_nt_masked( + hidden_states: torch.Tensor, # 3d + input_global_scale: torch.Tensor, # (l,) + weights: torch.Tensor, + w_global_scale: torch.Tensor, # (l,) + masked_m: torch.Tensor, +): + # hidden_states: [l, m, k] + # weights: [l, n, k] + aq, aq_sf = scaled_fp4_grouped_quantize( + hidden_states, + masked_m.to(hidden_states.device), + input_global_scale, + ) + num_experts, n, k = weights.shape + bq, bq_sf = scaled_fp4_grouped_quantize( + weights, + torch.full((num_experts,), n, device=weights.device, dtype=torch.int32), + w_global_scale, + ) + + out = torch.zeros( + (num_experts, max(masked_m), n), dtype=weights.dtype, device=aq.device + ) + out = out.permute(1, 2, 0) # requirement of kernel + sf_vec_size = 16 + ab_dtype = "float4_e2m1fn" + sf_dtype = "float8_e4m3fn" + c_dtype = "bfloat16" + alpha = 1.0 / (input_global_scale * w_global_scale).to(out.dtype).view( + 1, 1, num_experts + ) + + def get_cute_dtype(input: torch.Tensor) -> str: + if input.dtype == torch.bfloat16: + return "bfloat16" + elif input.dtype == torch.float16: + return "float16" + elif input.dtype == torch.float32: + return "float32" + else: + raise ValueError(f"Unsupported cute dtype {input.dtype}") + + cutedsl_gmm_masked( + (aq, aq_sf), + (bq, bq_sf), + out, + masked_m.to(aq.device), + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=alpha, + alpha_dtype=get_cute_dtype(alpha), + ) + + return out + + +@pytest.mark.parametrize("bs, hidden_dim, inter_dim", [(2, 128, 256), (16, 128, 512)]) +@pytest.mark.parametrize("topk", [1, 2, 4]) +@torch.inference_mode() +def test_flashinfer_cutedsl_moe_masked( + bs: int, hidden_dim: int, inter_dim: int, topk: int +): + torch.manual_seed(42) + device = "cuda" + num_experts = 8 + hidden_states = ( + torch.randn(bs, 
hidden_dim, dtype=torch.bfloat16, device=device) / 5.0 + ) + w1 = ( + torch.randn( + num_experts, 2 * inter_dim, hidden_dim, dtype=torch.bfloat16, device=device + ) + / 10.0 + ) + w2 = ( + torch.randn( + num_experts, hidden_dim, inter_dim, dtype=torch.bfloat16, device=device + ) + / 10.0 + ) + router_logits = torch.randn(bs, num_experts, dtype=torch.float32) + + hidden_states_expanded = ( + hidden_states.view(bs, -1, hidden_dim) + .repeat(1, topk, 1) + .reshape(-1, hidden_dim) + ) + hidden_states_3d, masked_m, topk_idx, routing_weights = prepare_inputs( + hidden_states_expanded, router_logits, num_experts, topk + ) + + w1_amax = w1.abs().amax(dim=(1, 2)).to(torch.float32).to(w1.device) + w2_amax = w2.abs().amax(dim=(1, 2)).to(torch.float32).to(w2.device) + input_global_scale = torch.ones( + (num_experts,), dtype=torch.float32, device=hidden_states.device + ) + + w1_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax + w2_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax + a2_global_scale = torch.ones( + (num_experts,), dtype=torch.float32, device=hidden_states.device + ) # assume intermediate scale is 1.0 + + w1_fp4, w1_blockscale = scaled_fp4_grouped_quantize( + w1, + torch.ones(num_experts, dtype=torch.int32, device=w1.device) * 2 * inter_dim, + w1_global_scale, + ) + w2_fp4, w2_blockscale = scaled_fp4_grouped_quantize( + w2, + torch.ones(num_experts, dtype=torch.int32, device=w2.device) * hidden_dim, + w2_global_scale, + ) + + w1_alpha = 1.0 / (input_global_scale * w1_global_scale) + w2_alpha = 1.0 / (a2_global_scale * w2_global_scale) + + out = torch.empty_like(hidden_states_3d) + # Note: the 1st dim shouldn't be bs + wk = torch.empty( + num_experts, + hidden_states_3d.shape[1], + inter_dim * 2, + dtype=hidden_states_3d.dtype, + device=hidden_states.device, + ) + flashinfer_cutedsl_moe_masked( + hidden_states_3d.to(hidden_states.device), + input_global_scale, + w1_fp4.permute(2, 0, 1), + w1_blockscale, + w1_alpha, + w2_fp4.permute(2, 0, 1), + a2_global_scale, + w2_blockscale, + w2_alpha, + masked_m.to(hidden_states.device), + wk, + out, + ) + + # reference + a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale) + a_in_dtype = dequantize_nvfp4_to_dtype( + a_fp4, + a_scale_interleaved, + input_global_scale, + dtype=hidden_states.dtype, + device=hidden_states.device, + block_size=16, + ) + w1_d = torch.empty( + (num_experts, 2 * inter_dim, hidden_dim), device=w1.device, dtype=w1.dtype + ) + w2_d = torch.empty( + (num_experts, hidden_dim, inter_dim), device=w2.device, dtype=w2.dtype + ) + + for idx in range(0, num_experts): + w1_fp4_sliced, w1_blockscale_sliced = fp4_quantize( + w1[idx], w1_global_scale[idx] + ) + w2_fp4_sliced, w2_blockscale_sliced = fp4_quantize( + w2[idx], w2_global_scale[idx] + ) + w1_d[idx] = dequantize_nvfp4_to_dtype( + w1_fp4_sliced, + w1_blockscale_sliced, + w1_global_scale[idx], + dtype=w1.dtype, + device=w1.device, + block_size=16, + ) + w2_d[idx] = dequantize_nvfp4_to_dtype( + w2_fp4_sliced, + w2_blockscale_sliced, + w2_global_scale[idx], + dtype=w2.dtype, + device=w2.device, + block_size=16, + ) + + ref_output = torch_moe_nvfp4( + a_in_dtype, + w1_d, + w2_d, + topk, + routing_weights.to(a_in_dtype.device), + topk_idx.to(a_in_dtype.device), + ) + out_weighted = torch.zeros_like(ref_output, device=out.device, dtype=out.dtype) + + positions = torch.nonzero(masked_m[topk_idx], as_tuple=False) + rows, cols = positions[:, 0], positions[:, 1] + experts = topk_idx[rows, cols] + for i in range(num_experts): + mask = experts == i + 
if mask.any(): + idx = torch.nonzero(mask, as_tuple=False).squeeze(-1) + r, c = rows[idx], cols[idx] + out_weighted[r] += out[i, : len(r), :] * routing_weights[r, c].to( + out.device + ).unsqueeze(-1) + torch.testing.assert_close( + out_weighted.cpu(), ref_output.cpu(), atol=2e-1, rtol=2e-1 + ) + + +@pytest.mark.parametrize( + "bs, hidden_dim, inter_dim, topk", [(2, 128, 256, 2), (16, 128, 512, 5)] +) +@torch.inference_mode() +def test_grouped_gemm_nt_masked( + bs: int, hidden_dim: int, inter_dim: int, topk: int +) -> None: + torch.manual_seed(42) + B = bs + D = hidden_dim + N = inter_dim + # CuteDSL group gemm has issue when not all experts are active. + # i.e. masked = [2, 3, 0, 0, 1] where the 2nd and 3rd experts are inactive + # see https://github.com/flashinfer-ai/flashinfer/issues/1856 + num_experts = bs + hidden_states = torch.randn(B, D, dtype=torch.bfloat16, device="cuda") + weights = torch.randn(num_experts, N, D, dtype=torch.bfloat16, device="cuda") + router_logits = torch.randn(B, num_experts, dtype=torch.float32) + + hidden_states_expanded = ( + hidden_states.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + ) + hidden_states_3d, masked_m, topk_idx, _ = prepare_inputs( + hidden_states_expanded, router_logits, num_experts, topk + ) + + a_amax = ( + hidden_states_3d.abs() + .amax(dim=(1, 2)) + .to(torch.float32) + .to(hidden_states.device) + ) + b_amax = weights.abs().amax(dim=(1, 2)).to(torch.float32).to(weights.device) + a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + out_flashinfer = flashinfer_cutedsl_grouped_gemm_nt_masked( + hidden_states_3d.to(hidden_states.device), a_gs, weights, b_gs, masked_m + ) + # reference + out_ref = grouped_gemm_ref( + hidden_states_expanded=hidden_states_expanded, + hidden_states_3d=hidden_states_3d, + weights=weights, + topk_idx=topk_idx, + masked_m=masked_m, + B=B, + topk=topk, + num_experts=num_experts, + ) + # Note: just to compare the masked position due to cutedsl may write nan + # into unmasked position. + for i in range(num_experts): + torch.testing.assert_close( + out_flashinfer.permute(2, 0, 1)[i, : masked_m[i]], + out_ref.to(out_flashinfer.device)[i, : masked_m[i]], + atol=1e-1, + rtol=1e-1, + ) + + +if __name__ == "__main__": + test_flashinfer_cutedsl_moe_masked(16, 128, 512, 4) + test_grouped_gemm_nt_masked(16, 128, 512, 4) diff --git a/vllm/envs.py b/vllm/envs.py index 212d68114e46..1ff620af5722 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -157,7 +157,9 @@ VLLM_USE_FLASHINFER_MOE_FP16: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False - VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency" + VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = ( + "latency" + ) VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024 VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 @@ -1238,7 +1240,9 @@ def get_vllm_port() -> int | None: # - "latency": # Uses TensorRT-LLM kernels optimized for low-latency inference. "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices( - "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"] + "VLLM_FLASHINFER_MOE_BACKEND", + "latency", + ["throughput", "latency", "masked_gemm"], ), # Control the workspace buffer size for the FlashInfer backend. 
"VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index e0db248958b4..fea9f49c04b8 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -6,6 +6,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import envs from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( @@ -27,6 +28,8 @@ DEEPEP_QUANT_BLOCK_SIZE = 128 DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE] +logger = init_logger(__name__) + def dequant_fp8( expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor @@ -187,16 +190,25 @@ def _do_quant( # TODO (varun): Optimization - Use a batched version of quant x = x.view((-1, hidden_dim)) + q_dtype = quant_config.quant_dtype + + if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm": + logger.info_once( + "Skip quantization when using FlashInfer CUTEDSL(masked_gemm) " + "for ModelOptNvFp4FusedMoE." + ) + q_dtype = None + x, x_scales = moe_kernel_quantize_input( x, quant_config.a1_scale, - quant_config.quant_dtype, + q_dtype, quant_config.per_act_token_quant, quant_config.block_shape, ) x = x.view((num_experts, -1, hidden_dim)) - if quant_config.quant_dtype is not None: + if q_dtype is not None: assert x_scales is not None x_scales = normalize_batched_scales_shape(x_scales, num_experts) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py new file mode 100644 index 000000000000..2747ef04a349 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py @@ -0,0 +1,346 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate, +) +from vllm.utils.flashinfer import ( + flashinfer_cutedsl_grouped_gemm_nt_masked, + has_flashinfer_cutedsl_grouped_gemm_nt_masked, + scaled_fp4_grouped_quantize, + silu_and_mul_scaled_nvfp4_experts_quantize, +) + +logger = init_logger(__name__) + + +def is_valid_flashinfer_cutedsl_fused_moe( + hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor +) -> bool: + """ + Check if the given problem size is supported by the FlashInfer CuteDSL MoE + kernel. + """ + if not has_flashinfer_cutedsl_grouped_gemm_nt_masked(): + logger.debug_once( + "FlashInferCuteDSLExperts disabled: " + "flashinfer_cutedsl_fused_moe not available." + ) + return False + # Data type checks + if ( + w1.dtype != torch.uint8 + or w2.dtype != torch.uint8 + or hidden_states.dtype not in [torch.float32, torch.float16, torch.bfloat16] + ): + logger.debug_once( + "FlashInferCuteDSLExperts disabled: w1/w2 must be torch.uint8 " + f"(got w1={w1.dtype}, w2={w2.dtype}), hidden_states must be " + f"float32, float16, or bfloat16 (got {hidden_states.dtype})." 
+ ) + return False + return True + + +class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + out_dtype: torch.dtype, + quant_config: FusedMoEQuantConfig, + ): + super().__init__(quant_config) + assert quant_config.quant_dtype == "nvfp4", ( + "Only nvfp4 quantization are currently supported." + ) + self.out_dtype = out_dtype + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + + def supports_expert_map(self) -> bool: + return False + + def supports_chunking(self) -> bool: + # This refers to TP chunking; DP chunking is handled separately. + # TODO(shuw@nvidia.com): Set to False to be consistent with + # batched_deep_gemm_moe + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. + return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # We use global_num_experts due to how moe_align_block_size handles + # expert_maps. + """ + Compute the shapes for the temporary and final outputs of the two gemms + and activation in the fused expert function. Since the gemms are + independent, the workspace for the first gemm can be shared with the + workspace for the last gemm. + + Returns a tuple of: + - workspace13 shape tuple: must be large enough to hold the + result of either expert gemm. + - workspace2 shape tuple: must be large enough to hold the + result of the activation function. + - output shape tuple: must be exact size of the final gemm output. + - Workspace type: The dtype to use for the workspace tensors. + - Note: in order for activation chunking to work, the first dimension + of each tuple must be the number of tokens. + """ + output_shape = (local_num_experts, M, K) + workspace2 = (local_num_experts, M, N) + workspace1 = output_shape + return (workspace1, workspace2, output_shape) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, # Not used + workspace13: torch.Tensor | None, + workspace2: torch.Tensor | None, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool | None, + ): + assert self.quant_dtype == "nvfp4", ( + "Only nvfp4 quantization are currently supported." 
+ ) + # Ensure w1_scale and w2_scale are not None before calling view + assert self.w1_scale is not None and self.w2_scale is not None, ( + "w1_scale and w2_scale must not be None for FlashInferExperts" + ) + assert expert_tokens_meta is not None + expert_num_tokens = expert_tokens_meta.expert_num_tokens + assert hidden_states.ndim == 3 + assert self.w1_scale.ndim == 3 + assert self.w2_scale.ndim == 3 + flashinfer_cutedsl_moe_masked( + hidden_states=hidden_states, + input_global_scale=self.a1_gscale, + w1=w1, + w1_blockscale=self.w1_scale, + w1_alpha=self.g1_alphas, + w2=w2, + a2_global_scale=self.a2_gscale, + w2_blockscale=self.w2_scale, + w2_alpha=self.g2_alphas, + masked_m=expert_num_tokens, + workspace=workspace2, + out=output, + ) + + +def get_cute_dtype(input: torch.Tensor) -> str: + if input.dtype == torch.bfloat16: + return "bfloat16" + elif input.dtype == torch.float16: + return "float16" + elif input.dtype == torch.float32: + return "float32" + else: + raise ValueError(f"Unsupported cute dtype {input.dtype}") + + +def flashinfer_cutedsl_moe_masked( + hidden_states: torch.Tensor, + input_global_scale: torch.Tensor, + w1: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alpha, + w2: torch.Tensor, + a2_global_scale: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alpha, + masked_m: torch.Tensor, + workspace: torch.Tensor, + out: torch.Tensor, +): + """ + Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL + kernels. + + Args: + hidden_states (torch.Tensor): [num_experts, m, k], bf16 + input_global_scale (torch.Tensor): (l,) + w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8 + w1_blockscale (torch.Tensor): blockscale factors, e4m3, + w1_alpha (torch.Tensor): (l,) + w2 (torch.Tensor): fp4 weights, [l, k, n // 2], uint8 + a2_global_scale (torch.Tensor): (l,) + w2_blockscale (torch.Tensor): blockscale factors, e4m3, + w2_alpha (torch.Tensor): (l,) + masked_m (torch.Tensor): Masked dimension indices + workspace (torch.Tensor): For gateup_output + + Notes: + - Assumes max(masked_m) <= m. 
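+        - Shape sketch (illustrative sizes only, not taken from a real model):
+          with l=8 experts, m=64, k=128 and an intermediate size n=512, the
+          expected shapes are hidden_states [8, 64, 128] (bf16),
+          w1 [8, 1024, 64] (uint8), w2 [8, 128, 256] (uint8),
+          workspace [8, 64, 1024] for the gate-up output, and out [8, 64, 128].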
+ """ + + # === Assertions on dtypes === + assert input_global_scale.dtype == torch.float32, ( + f"input_global_scale must be float32, got {input_global_scale.dtype}" + ) + assert w1.dtype == torch.uint8, f"w1 must be uint8, got {w1.dtype}" + assert w1_blockscale.dtype == torch.float8_e4m3fn, ( + f"w1_blockscale must be float8_e4m3fn, got {w1_blockscale.dtype}" + ) + assert w1_alpha.dtype == torch.float32, ( + f"w1_alpha must be float32, got {w1_alpha.dtype}" + ) + assert w2.dtype == torch.uint8, f"w2 must be uint8, got {w2.dtype}" + assert a2_global_scale.dtype == torch.float32, ( + f"a2_global_scale must be float32, got {a2_global_scale.dtype}" + ) + assert w2_blockscale.dtype == torch.float8_e4m3fn, ( + f"w2_blockscale must be float8_e4m3fn, got {w2_blockscale.dtype}" + ) + assert w2_alpha.dtype == torch.float32, ( + f"w2_alpha must be float32, got {w2_alpha.dtype}" + ) + + # === Assertions on shapes === + n = w2.shape[-1] * 2 # intermediate dimension + num_experts, m, k = hidden_states.shape + + assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n, got {w1.shape}" + assert w1.shape[-1] * 2 == k, ( + f"w1 last dim * 2 must equal k, got {w1.shape[-1]} vs k={k}" + ) + assert w2.shape[-2:] == ( + k, + n // 2, + ), f"w2 shape mismatch, got {w2.shape[-2:]}, expected {(k, n // 2)}" + + assert input_global_scale.shape == (num_experts,), ( + f"input_global_scale must be (l,), got {input_global_scale.shape}" + ) + assert w1_alpha.shape == (num_experts,), ( + f"w1_alpha must be (l,), got {w1_alpha.shape}" + ) + assert a2_global_scale.shape == (num_experts,), ( + f"a2_global_scale must be (l,), got {a2_global_scale.shape}" + ) + assert w2_alpha.shape == (num_experts,), ( + f"w2_alpha must be (l,), got {w2_alpha.shape}" + ) + + aq, aq_sf = scaled_fp4_grouped_quantize( + hidden_states, + masked_m, + input_global_scale, + ) + + workspace = workspace.permute(1, 2, 0) # requirement of kernel + sf_vec_size = 16 + assert aq_sf.dtype == torch.float8_e4m3fn + assert aq.dtype == torch.uint8 + ab_dtype = "float4_e2m1fn" + sf_dtype = "float8_e4m3fn" + + c_dtype = get_cute_dtype(hidden_states) + + # Gemm1 + flashinfer_cutedsl_grouped_gemm_nt_masked( + (aq, aq_sf), + (w1.permute(1, 2, 0), w1_blockscale), + workspace, + masked_m, + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=w1_alpha.view(1, 1, num_experts), + alpha_dtype=get_cute_dtype(w1_alpha), + ) # in logical [m, n, l] + + # SILU and quantization + diq, diq_sf = silu_and_mul_scaled_nvfp4_experts_quantize( + workspace.permute(2, 0, 1), + masked_m, + a2_global_scale, + ) + + # Gemm2 + out = out.permute(1, 2, 0) # requirement of kernel + flashinfer_cutedsl_grouped_gemm_nt_masked( + (diq, diq_sf), + (w2.permute(1, 2, 0), w2_blockscale), + out, + masked_m, + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=w2_alpha.view(1, 1, num_experts), + alpha_dtype=get_cute_dtype(w2_alpha), + ) # in logical [m, k, l] + out = out.permute(2, 0, 1) + + +def flashinfer_cutedsl_moe_fp4( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + quant_config: FusedMoEQuantConfig, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + create_flashinfer_prepare_finalize, + ) + 
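+    # Rough composition: the prepare/finalize stage from
+    # create_flashinfer_prepare_finalize handles routing/dispatch and the final
+    # top-k weighting and reduction, while FlashInferCuteDSLExperts runs the
+    # two masked grouped GEMMs with the silu-and-mul + nvfp4 re-quantization
+    # step in between (see flashinfer_cutedsl_moe_masked above).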
+ fused_experts = mk.FusedMoEModularKernel( + create_flashinfer_prepare_finalize(use_dp=False), # could be swapped later + FlashInferCuteDSLExperts( + out_dtype=hidden_states.dtype, + quant_config=quant_config, + ), + ) + + return fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 38ab7cd4f115..f684c17452a9 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1468,7 +1468,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: gemm1_weight = layer.w13_weight.data gemm1_weight_scale = layer.w13_weight_scale.data - if self.allow_flashinfer: + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + ): gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( gemm1_weight, gemm1_weight_scale, dim=-2 ) @@ -1746,17 +1749,26 @@ def apply( workspace=layer.workspace, ) - elif ( - self.allow_flashinfer - and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS - ): - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - flashinfer_cutlass_moe_fp4, + elif self.allow_flashinfer: + assert self.flashinfer_moe_backend in ( + FlashinferMoeBackend.CUTLASS, + FlashinferMoeBackend.CUTEDSL, ) + if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + flashinfer_cutlass_moe_fp4, + ) - assert self.moe_quant_config is not None + flashinfer_fn_moe_fp4 = flashinfer_cutlass_moe_fp4 + else: + from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( # noqa: E501 + flashinfer_cutedsl_moe_fp4, + ) + + flashinfer_fn_moe_fp4 = flashinfer_cutedsl_moe_fp4 - return flashinfer_cutlass_moe_fp4( + assert self.moe_quant_config is not None + return flashinfer_fn_moe_fp4( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index fdf330329e20..36e8599dd948 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -10,6 +10,9 @@ FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( + FlashInferCuteDSLExperts, +) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( FlashInferExperts, ) @@ -17,10 +20,14 @@ create_flashinfer_prepare_finalize, ) from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.flashinfer import ( + has_flashinfer_cutedsl_grouped_gemm_nt_masked, + has_flashinfer_cutlass_fused_moe, +) __all__ = [ "is_flashinfer_fp4_cutlass_moe_available", + "is_flashinfer_fp4_cutedsl_moe_available", "reorder_w1w3_to_w3w1", "build_flashinfer_fp4_cutlass_moe_prepare_finalize", ] @@ -36,6 +43,16 @@ def is_flashinfer_fp4_cutlass_moe_available() -> bool: ) +def is_flashinfer_fp4_cutedsl_moe_available() -> bool: + """Return ``True`` when FlashInfer CUTEDSL NV-FP4 kernels can be used.""" + 
return ( + envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutedsl_grouped_gemm_nt_masked() + and current_platform.is_cuda() + and current_platform.is_device_capability(100) + ) + + def reorder_w1w3_to_w3w1( weight: torch.Tensor, scale: torch.Tensor, dim: int = -2 ) -> tuple[torch.Tensor, torch.Tensor]: @@ -72,15 +89,21 @@ def select_nvfp4_gemm_impl( """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" if allow_flashinfer: - return FlashInferExperts( - out_dtype=moe.in_dtype, - quant_config=moe_quant_config, - ep_rank=moe.moe_parallel_config.ep_rank, - ep_size=moe.moe_parallel_config.ep_size, - tp_rank=moe.moe_parallel_config.tp_rank, - tp_size=moe.moe_parallel_config.tp_size, - use_dp=moe.moe_parallel_config.dp_size > 1, - ) + if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm": + return FlashInferCuteDSLExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ) + elif envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput": + return FlashInferExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + use_dp=moe.moe_parallel_config.dp_size > 1, + ) # native cutlass experts currently don't support DP; TP case won't call this raise ValueError( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index f22e17945d1f..7eba8359b92f 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -25,6 +25,7 @@ class FlashinferMoeBackend(Enum): TENSORRT_LLM = "TensorRT-LLM" CUTLASS = "CUTLASS" + CUTEDSL = "CUTEDSL" def calculate_tile_tokens_dim(num_tokens, top_k, num_experts): @@ -273,19 +274,21 @@ def flashinfer_cutlass_moe_fp8( def get_flashinfer_moe_backend() -> FlashinferMoeBackend: + backend_map = { + "throughput": FlashinferMoeBackend.CUTLASS, + "latency": FlashinferMoeBackend.TENSORRT_LLM, + "masked_gemm": FlashinferMoeBackend.CUTEDSL, + } + flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations - if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability( - 90 - ): + if flashinfer_moe_backend in backend_map: + return backend_map[flashinfer_moe_backend] + elif current_platform.is_device_capability(90): return FlashinferMoeBackend.CUTLASS - elif flashinfer_moe_backend == "latency": - return FlashinferMoeBackend.TENSORRT_LLM - allowed_backends = ["throughput", "latency"] raise ValueError( - f"Unknown flashinfer moe backend: {flashinfer_moe_backend}" - f" expected one of {allowed_backends}" + f"Unknown flashinfer moe backend: {flashinfer_moe_backend!r}. " + f"Expected one of {list(backend_map.keys())}." 
     )
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
index c3f26cc77411..44c5b027daf4 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -5,6 +5,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+    is_flashinfer_fp4_cutedsl_moe_available,
     is_flashinfer_fp4_cutlass_moe_available,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
@@ -32,7 +33,10 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
     """Detect platform support for NV-FP4 fused-MoE path"""
     cutlass_supported = cutlass_fp4_supported()
 
-    allow_flashinfer = cutlass_supported and is_flashinfer_fp4_cutlass_moe_available()
+    allow_flashinfer = cutlass_supported and (
+        is_flashinfer_fp4_cutlass_moe_available()
+        or is_flashinfer_fp4_cutedsl_moe_available()
+    )
 
     if allow_flashinfer:
         _logger.info_once(
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 1209d64901bf..9f9976d52b4a 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -114,7 +114,17 @@ def wrapper(*args, **kwargs):
 flashinfer_cutlass_fused_moe = _lazy_import_wrapper(
     "flashinfer.fused_moe", "cutlass_fused_moe"
 )
+flashinfer_cutedsl_grouped_gemm_nt_masked = _lazy_import_wrapper(
+    "flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"
+)
 flashinfer_fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
+nvfp4_batched_quantize = _lazy_import_wrapper("flashinfer", "nvfp4_batched_quantize")
+silu_and_mul_scaled_nvfp4_experts_quantize = _lazy_import_wrapper(
+    "flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize"
+)
+scaled_fp4_grouped_quantize = _lazy_import_wrapper(
+    "flashinfer", "scaled_fp4_grouped_quantize"
+)
 nvfp4_block_scale_interleave = _lazy_import_wrapper(
     "flashinfer", "nvfp4_block_scale_interleave"
 )
@@ -166,6 +176,14 @@ def has_flashinfer_moe() -> bool:
     )
 
 
+@functools.cache
+def has_flashinfer_cutedsl() -> bool:
+    """Return ``True`` if FlashInfer cutedsl module is available."""
+    return (
+        has_flashinfer() and importlib.util.find_spec("flashinfer.cute_dsl") is not None
+    )
+
+
 @functools.cache
 def has_flashinfer_cutlass_fused_moe() -> bool:
     """Return `True` if FlashInfer CUTLASS fused MoE is available."""
@@ -187,6 +205,26 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
     return True
 
 
+@functools.cache
+def has_flashinfer_cutedsl_grouped_gemm_nt_masked() -> bool:
+    """Return ``True`` if FlashInfer CuteDSL masked grouped GEMM is available."""
+    if not has_flashinfer_cutedsl():
+        return False
+
+    # Check if all required functions are available
+    required_functions = [
+        ("flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"),
+        ("flashinfer", "scaled_fp4_grouped_quantize"),
+        ("flashinfer", "silu_and_scaled_nvfp4_experts_quantize"),
+    ]
+
+    for module_name, attr_name in required_functions:
+        mod = _get_submodule(module_name)
+        if not mod or not hasattr(mod, attr_name):
+            return False
+    return True
+
+
 @functools.cache
 def has_nvidia_artifactory() -> bool:
     """Return `True` if NVIDIA's artifactory is accessible. 
@@ -472,7 +510,10 @@ def flashinfer_disable_q_quantization() -> bool: "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", "flashinfer_cutlass_fused_moe", + "flashinfer_cutedsl_grouped_gemm_nt_masked", "flashinfer_fp4_quantize", + "silu_and_mul_scaled_nvfp4_experts_quantize", + "scaled_fp4_grouped_quantize", "nvfp4_block_scale_interleave", "trtllm_fp4_block_scale_moe", "autotune", @@ -480,6 +521,7 @@ def flashinfer_disable_q_quantization() -> bool: "has_flashinfer_comm", "has_flashinfer_all2all", "has_flashinfer_cutlass_fused_moe", + "has_flashinfer_cutedsl_grouped_gemm_nt_masked", "has_nvidia_artifactory", "supports_trtllm_attention", "can_use_trtllm_attention", From 88f5b19f0bc681c016eaaa17502d3bb4e2b59b51 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 19 Nov 2025 16:30:04 -0500 Subject: [PATCH 207/578] [DeepSeek] Fix DeepSeek V3.2 Rope Embedding (#28968) Signed-off-by: Yongye Zhu --- vllm/model_executor/layers/mla.py | 6 +++++- vllm/model_executor/models/deepseek_v2.py | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index c4c44b83ae6b..6ebfa47a9dc3 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -24,6 +24,7 @@ class MLAModules: q_b_proj: torch.nn.Module | None q_proj: torch.nn.Module | None indexer: torch.nn.Module | None + indexer_rotary_emb: torch.nn.Module | None is_sparse: bool topk_indices_buffer: torch.Tensor | None @@ -80,6 +81,7 @@ def __init__( self.rotary_emb = mla_modules.rotary_emb self.o_proj = mla_modules.o_proj self.indexer = mla_modules.indexer + self.indexer_rope_emb = mla_modules.indexer_rotary_emb self.is_sparse = mla_modules.is_sparse if self.indexer is not None: @@ -153,7 +155,9 @@ def forward_native( ) if self.indexer and self.is_sparse: - _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb) + _topk_indices = self.indexer( + hidden_states, q_c, positions, self.indexer_rope_emb + ) attn_out = self.mla_attn( q, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6675b2133f38..c0ff621d8408 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -837,8 +837,8 @@ def forward( ) q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1)) - q = torch.cat([q_pe, q_nope], dim=-1) - k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1) + q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1) + k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1) # we only quant q here since k quant is fused with cache insertion q = q.view(-1, self.head_dim) @@ -987,6 +987,14 @@ def __init__( self.is_v32 = hasattr(config, "index_topk") if self.is_v32: + self.indexer_rope_emb = get_rope( + qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=True, + ) self.indexer = Indexer( vllm_config, config, @@ -998,6 +1006,7 @@ def __init__( f"{prefix}.indexer", ) else: + self.indexer_rope_emb = None self.indexer = None mla_modules = MLAModules( @@ -1015,6 +1024,7 @@ def __init__( q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None, q_proj=self.q_proj if self.q_lora_rank is None else None, indexer=self.indexer, + indexer_rotary_emb=self.indexer_rope_emb, is_sparse=self.is_v32, topk_indices_buffer=topk_indices_buffer, ) From 22e44ad589d951f440ef98141a2a6f9df97f6873 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 
19 Nov 2025 15:31:33 -0600 Subject: [PATCH 208/578] [ROCm][CI] Fix Weight Loading With Multiple GPU Tests on ROCm (#28984) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 5 ++--- tests/weight_loading/models-amd.txt | 3 +++ tests/weight_loading/models-large-amd.txt | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 tests/weight_loading/models-amd.txt create mode 100644 tests/weight_loading/models-large-amd.txt diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0049f3540340..37c6bd427672 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1323,7 +1323,7 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - label: Weight Loading Multiple GPU Test - Large Models # optional mirror_hardwares: [amdexperimental] @@ -1331,13 +1331,12 @@ steps: # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 2 - gpu: a100 optional: true source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] diff --git a/tests/weight_loading/models-amd.txt b/tests/weight_loading/models-amd.txt new file mode 100644 index 000000000000..e31e904c08af --- /dev/null +++ b/tests/weight_loading/models-amd.txt @@ -0,0 +1,3 @@ +fp8, amd/Meta-Llama-3.1-8B-Instruct-FP8-KV, main +None, amd/Llama-3.2-1B-Instruct-FP8-KV, main +fp8, amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV, main diff --git a/tests/weight_loading/models-large-amd.txt b/tests/weight_loading/models-large-amd.txt new file mode 100644 index 000000000000..b6f5b4b16b37 --- /dev/null +++ b/tests/weight_loading/models-large-amd.txt @@ -0,0 +1,3 @@ +fp8, amd/Meta-Llama-3.1-70B-Instruct-FP8-KV, main +None, microsoft/phi-4, main +fp8, amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV, main From 8f4f77a7275ecac594f84bdb41b67c95cf3eb26d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 19 Nov 2025 16:43:54 -0500 Subject: [PATCH 209/578] [BugFix] Fix false assertion with spec-decode=[2,4,..] 
and TP>2 (#29036) Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index ca01cb3fb55d..1c3ef502f0f4 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -921,7 +921,7 @@ def adjust_cudagraph_sizes_for_spec_decode( self, uniform_decode_query_len: int, tensor_parallel_size: int ): multiple_of = uniform_decode_query_len - if tensor_parallel_size > 1: + if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism: multiple_of = max(uniform_decode_query_len, tensor_parallel_size) if ( multiple_of % uniform_decode_query_len != 0 From cb0a7b4bea26657da989562a10055b7d0b59fd3a Mon Sep 17 00:00:00 2001 From: Max Hu Date: Wed, 19 Nov 2025 16:54:15 -0500 Subject: [PATCH 210/578] [Bugfix] Move flashinfer kernel check into ```__init__``` function of ```FusedMoE``` (#29018) Signed-off-by: Max Hu --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7b15e63e9e35..be1910266c87 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -574,6 +574,9 @@ def __init__( is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, ) + self.moe_config_use_flashinfer_cutlass_kernels = ( + self.moe_config.use_flashinfer_cutlass_kernels + ) self.quant_config = quant_config @@ -728,7 +731,7 @@ def use_flashinfer_cutlass_kernels(self): return ( self.moe_quant_config is not None and self.moe_quant_config.quant_dtype == "nvfp4" - and self.moe_config.use_flashinfer_cutlass_kernels + and self.moe_config_use_flashinfer_cutlass_kernels ) @property From 0075bfffd4201d1377f0d048848f82911e917639 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:22:43 -0500 Subject: [PATCH 211/578] [CI] Fix precommit `rope_theta` issue (#29040) Signed-off-by: yewentao256 --- vllm/model_executor/models/deepseek_v2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c0ff621d8408..c50fc327e760 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -991,8 +991,7 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.indexer = Indexer( From 8e38e998298364b0a94cddf7ccc59d8466c2396a Mon Sep 17 00:00:00 2001 From: JartX Date: Thu, 20 Nov 2025 00:30:08 +0100 Subject: [PATCH 212/578] [Feature] EPLB on Qwen3VLMoe and CompressedTensorsWNA16MoEMethod (#28849) --- .../compressed_tensors_moe.py | 27 +++++++- vllm/model_executor/models/qwen3_vl_moe.py | 62 +++++++++++++++++-- 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 22b3c477f420..fa254030a271 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1921,9 +1921,20 @@ def apply( 
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."
-            )
+            if expert_load_view is None:
+                raise ValueError("enable_eplb=True requires expert_load_view != None")
+            if logical_to_physical_map is None:
+                raise ValueError(
+                    "enable_eplb=True requires logical_to_physical_map != None"
+                )
+            if logical_replica_count is None:
+                raise ValueError(
+                    "enable_eplb=True requires logical_replica_count != None"
+                )
+            if not isinstance(layer, FusedMoE):
+                raise TypeError(
+                    "EPLB is only supported when `layer` is an instance of FusedMoE."
+                )
 
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -1940,6 +1951,12 @@ def apply(
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
             indices_type=self.topk_indices_dtype,
+            num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
         )
 
         return fused_experts(
@@ -1956,6 +1973,10 @@ def apply(
             quant_config=self.moe_quant_config,
         )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
 
 class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
     """
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index 5c3205faf9c2..e2c129120b1a 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -15,7 +15,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -29,7 +29,9 @@
 from itertools import islice
 
 import torch
-from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig
+from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import (
+    Qwen3VLMoeConfig,
+)
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
@@ -44,7 +46,12 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
 
-from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel
+from .interfaces import MixtureOfExperts
+from .qwen3_moe import (
+    Qwen3MoeForCausalLM,
+    Qwen3MoeModel,
+    Qwen3MoeSparseMoeBlock,
+)
 from .qwen3_vl import (
     Qwen3_VisionTransformer,
     Qwen3VLDummyInputsBuilder,
@@ -344,12 +351,56 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
 
 
+class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts):
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.language_model.model.layers:
+            if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = 
self.num_redundant_experts + moe.experts.update_expert_map() + + def set_moe_parameters(self): + self.expert_weights = [] + + self.moe_layers = [] + example_moe = None + for layer in self.language_model.model.layers: + if hasattr(layer, "mlp") and isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError("No Qwen3Moe layer found in the language_model.") + + # Set MoE hyperparameters + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + @MULTIMODAL_REGISTRY.register_processor( Qwen3VLMultiModalProcessor, info=Qwen3VLMoeProcessingInfo, dummy_inputs=Qwen3VLDummyInputsBuilder, ) -class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): +class Qwen3VLMoeForConditionalGeneration( + Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts +): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -413,3 +464,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.deepstack_input_embeds = None self.visual_dim = config.vision_config.out_hidden_size self.multiscale_dim = self.visual_dim * self.deepstack_num_level + + # Set MoE hyperparameters + self.set_moe_parameters() From 3aaa94ac99f4b295ba95f14b4968620b2127044f Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:47:13 -0500 Subject: [PATCH 213/578] [Performance] Reduce DeepGEMM N dim restriction from 128 to 64 multiplier (#28687) Signed-off-by: Alexander Matveev Signed-off-by: mgoin Co-authored-by: mgoin --- .buildkite/test-pipeline.yaml | 20 ++++++++++++++++++++ tests/kernels/quantization/test_block_fp8.py | 11 +++++++---- vllm/utils/deep_gemm.py | 11 +++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5309581d8e81..71249a9543c7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -550,6 +550,26 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + optional: true + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s tests/kernels/moe/test_deepgemm.py + - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py + - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index e9973c1fcc15..d0e4f6554a91 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -22,6 +22,7 @@ fp8_gemm_nt, get_col_major_tma_aligned_tensor, 
per_block_cast_to_fp8, + should_use_deepgemm_for_fp8_linear, ) from vllm.utils.import_utils import has_deep_gemm @@ -157,10 +158,6 @@ def test_w8a8_block_fp8_cutlass_matmul(): @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): - # only aligned sizes - if M % 4 != 0 or K % 128 != 0 or N % 64 != 0: - pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") - torch.manual_seed(seed) fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max = fp8_info.max @@ -168,6 +165,12 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + # only aligned sizes are supported by deepgemm + if not should_use_deepgemm_for_fp8_linear( + output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True + ): + pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") + A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1]) B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b5ab37534dd7..6b0a383a0e28 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -365,11 +365,18 @@ def should_use_deepgemm_for_fp8_linear( ): if supports_deep_gemm is None: supports_deep_gemm = is_deep_gemm_supported() + + # Verify DeepGEMM N/K dims requirements + # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul + # test inside kernels/quatization/test_block_fp8.py + N_MULTIPLE = 64 + K_MULTIPLE = 128 + return ( supports_deep_gemm and output_dtype == torch.bfloat16 - and weight.shape[0] % 128 == 0 - and weight.shape[1] % 128 == 0 + and weight.shape[0] % N_MULTIPLE == 0 + and weight.shape[1] % K_MULTIPLE == 0 ) From 5031cd5d55ad99e8f9b31dd0020a06b346f6e493 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:53:15 -0500 Subject: [PATCH 214/578] [Refactor] Optimize `select_experts` (#28069) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 ----- vllm/model_executor/layers/fused_moe/layer.py | 11 ++++------- vllm/model_executor/layers/quantization/modelopt.py | 2 +- vllm/model_executor/models/longcat_flash.py | 2 +- vllm/model_executor/models/openpangu.py | 2 +- 5 files changed, 7 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 2e042d85fcfc..f44328418f1b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1246,7 +1246,6 @@ def eplb_map_to_physical_and_record( expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, - indices_type: torch.dtype | None = None, ) -> torch.Tensor: """ Map the logical expert ids to physical expert ids @@ -1260,7 +1259,6 @@ def eplb_map_to_physical_and_record( expert_load_view: The expert load view. logical_to_physical_map: The logical to physical map. logical_replica_count: The logical replica count. - indices_type: The indices type. Returns: The physical expert ids. 
@@ -1310,9 +1308,6 @@ def eplb_map_to_physical_and_record( index=topk_ids_flatten.long(), src=torch.ones_like(topk_ids_flatten).to(expert_load_view), ) - - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) return topk_ids diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index be1910266c87..d9525a7439c3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -68,7 +68,6 @@ def _eplb_map_to_physical_and_record( expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, - indices_type: torch.dtype | None, ) -> torch.Tensor: # CPU fallback: no EPLB so just return as is return topk_ids @@ -1509,8 +1508,6 @@ def select_experts( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, ) - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) elif e_score_correction_bias is not None: topk_weights, topk_ids = fused_topk_bias( hidden_states=hidden_states, @@ -1519,7 +1516,7 @@ def select_experts( topk=top_k, renormalize=renormalize, ) - if routed_scaling_factor is not None: + if routed_scaling_factor != 1.0: topk_weights *= routed_scaling_factor elif custom_routing_function is None: topk_weights, topk_ids, token_expert_indices = fused_topk( @@ -1536,8 +1533,6 @@ def select_experts( topk=top_k, renormalize=renormalize, ) - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) if enable_eplb: assert expert_load_view is not None @@ -1549,9 +1544,11 @@ def select_experts( expert_load_view=expert_load_view, logical_to_physical_map=logical_to_physical_map, logical_replica_count=logical_replica_count, - indices_type=indices_type, ) + if (indices_type is not None) and topk_ids.dtype != indices_type: + topk_ids = topk_ids.to(dtype=indices_type) + assert topk_ids.dtype == indices_type or indices_type is None # Compute zero expert result if needed diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index f684c17452a9..dedab33c1bdb 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1706,7 +1706,7 @@ def apply( intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, - routed_scaling_factor=None, + routed_scaling_factor=1.0, tile_tokens_dim=None, routing_method_type=routing_method_type, do_finalize=True, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index fafe97cd2be7..c5441283f971 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -118,7 +118,7 @@ def __init__( router_dtype="float32", router_bias=False, topk_method=None, - routed_scaling_factor=None, + routed_scaling_factor=1.0, zero_expert_num=0, zero_expert_type=None, nextn_use_scmoe=False, diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index f814cdfec5a2..4124a181a14c 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -625,7 +625,7 @@ def __init__( bias=getattr(config, "mlp_bias", False), prefix=f"{prefix}.mlp", ) - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", None) + self.routed_scaling_factor = getattr(config, 
"routed_scaling_factor", 1.0) self.num_hidden_layers = config.num_hidden_layers self.first_k_dense_replace = getattr( config, "first_k_dense_replace", self.num_hidden_layers From 537cc635c77ac63f643c5289137debdd8f9591ac Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 19 Nov 2025 16:10:22 -0800 Subject: [PATCH 215/578] [GC Debugger] Simply and improve GC Debugger Utils (#29029) Signed-off-by: Jialin Ouyang --- vllm/utils/gc_utils.py | 7 ++++--- vllm/v1/engine/core.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py index 160ac9ac263a..3436e450a269 100644 --- a/vllm/utils/gc_utils.py +++ b/vllm/utils/gc_utils.py @@ -68,9 +68,10 @@ def handle(self, phase: str, info: dict[str, int]) -> None: # Before GC started, record GC start time # and top collected objects self.start_time_ns = time.monotonic_ns() - self.gc_top_collected_objects = _compute_top_gc_collected_objects( - gc.get_objects(generation), self.config.top_objects - ) + if (top_objects := self.config.top_objects) > 0: + self.gc_top_collected_objects = _compute_top_gc_collected_objects( + gc.get_objects(generation), top_objects + ) elif phase == "stop": # After GC finished, Record GC elapsed time and # optionally top collected objects diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6be19894d332..8657a95b5e6e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -206,6 +206,8 @@ def __init__( # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. freeze_gc_heap() + # If enable, attach GC debugger after static variable freeze. + maybe_attach_gc_debug_callback() def _initialize_kv_caches( self, vllm_config: VllmConfig @@ -645,9 +647,6 @@ def __init__( assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - # If enable, attach GC debugger after static variable freeze. - maybe_attach_gc_debug_callback() - # Enable environment variable cache (e.g. 
assume no more # environment variable overrides after this point) enable_envs_cache() From 9ccef8e333ccd988a587990740405503e76c8c20 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 19 Nov 2025 16:26:04 -0800 Subject: [PATCH 216/578] [Misc] Colorize logs (#29017) Signed-off-by: Nick Hill --- tests/test_logger.py | 94 ++++++++++++++++++--------------- vllm/envs.py | 9 ++++ vllm/logger.py | 51 ++++++++++++------ vllm/logging_utils/__init__.py | 3 +- vllm/logging_utils/formatter.py | 50 ++++++++++++++++++ vllm/utils/system_utils.py | 7 ++- 6 files changed, 152 insertions(+), 62 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index 01672358902f..8900e9c2a1e6 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -49,10 +49,13 @@ def test_trace_function_call(): os.remove(path) -def test_default_vllm_root_logger_configuration(): +def test_default_vllm_root_logger_configuration(monkeypatch): """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default behavior is activated.""" + monkeypatch.setenv("VLLM_LOGGING_COLOR", "0") + _configure_vllm_root_logger() + logger = logging.getLogger("vllm") assert logger.level == logging.DEBUG assert not logger.propagate @@ -70,12 +73,13 @@ def test_default_vllm_root_logger_configuration(): assert formatter.datefmt == _DATE_FORMAT -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None) -def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(): +def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(monkeypatch): """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default behavior is activated.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False) + root_logger = logging.getLogger("vllm") root_handler = root_logger.handlers[0] @@ -99,49 +103,50 @@ def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(): assert log_record.levelno == logging.INFO -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) -@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None) -def test_logger_configuring_can_be_disabled(): +def test_logger_configuring_can_be_disabled(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0") + monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False) with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_not_called() -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -@patch( - "vllm.logger.VLLM_LOGGING_CONFIG_PATH", - "/if/there/is/a/file/here/then/you/did/this/to/yourself.json", -) -def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(): +def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + monkeypatch.setenv( + "VLLM_LOGGING_CONFIG_PATH", + "/if/there/is/a/file/here/then/you/did/this/to/yourself.json", + ) + with pytest.raises(RuntimeError) as ex_info: _configure_vllm_root_logger() assert 
ex_info.type == RuntimeError # noqa: E721 assert "File does not exist" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(): +def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write("---\nloggers: []\nversion: 1") logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(JSONDecodeError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type == JSONDecodeError - assert "Expecting value" in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(JSONDecodeError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type == JSONDecodeError + assert "Expecting value" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) @pytest.mark.parametrize( "unexpected_config", ( @@ -151,26 +156,30 @@ def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(): ), ) def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json( + monkeypatch, unexpected_config: Any, ): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(unexpected_config)) logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(ValueError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type == ValueError # noqa: E721 - assert "Invalid logging config. Expected dict, got" in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(ValueError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type == ValueError # noqa: E721 + assert "Invalid logging config. 
Expected dict, got" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -def test_custom_logging_config_is_parsed_and_used_when_provided(): +def test_custom_logging_config_is_parsed_and_used_when_provided(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + valid_logging_config = { "loggers": { "vllm.test_logger.logger": { @@ -183,19 +192,18 @@ def test_custom_logging_config_is_parsed_and_used_when_provided(): with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(valid_logging_config)) logging_config_file.flush() - with ( - patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name), - patch("vllm.logger.dictConfig") as dict_config_mock, - ): + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_called_with(valid_logging_config) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) -def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): +def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0") + valid_logging_config = { "loggers": { "vllm.test_logger.logger": { @@ -207,15 +215,15 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(valid_logging_config)) logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(RuntimeError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type is RuntimeError - expected_message_snippet = ( - "VLLM_CONFIGURE_LOGGING evaluated to false, but " - "VLLM_LOGGING_CONFIG_PATH was given." - ) - assert expected_message_snippet in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(RuntimeError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type is RuntimeError + expected_message_snippet = ( + "VLLM_CONFIGURE_LOGGING evaluated to false, but " + "VLLM_LOGGING_CONFIG_PATH was given." + ) + assert expected_message_snippet in str(ex_info) # Remember! The root logger is assumed to have been configured as # though VLLM_CONFIGURE_LOGGING=1 and VLLM_LOGGING_CONFIG_PATH=None. diff --git a/vllm/envs.py b/vllm/envs.py index 1ff620af5722..614bc94b978b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -42,6 +42,8 @@ VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_STREAM: str = "ext://sys.stdout" VLLM_LOGGING_CONFIG_PATH: str | None = None + VLLM_LOGGING_COLOR: str = "auto" + NO_COLOR: bool = False VLLM_LOG_STATS_INTERVAL: float = 10.0 VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: str | None = None @@ -616,6 +618,11 @@ def get_vllm_port() -> int | None: "VLLM_LOGGING_STREAM": lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"), # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # Controls colored logging output. 
Options: "auto" (default, colors when terminal), + # "1" (always use colors), "0" (never use colors) + "VLLM_LOGGING_COLOR": lambda: os.getenv("VLLM_LOGGING_COLOR", "auto"), + # Standard unix flag for disabling ANSI color codes + "NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0", # If set, vllm will log stats at this interval in seconds # If not set, vllm will log stats every 10 seconds. "VLLM_LOG_STATS_INTERVAL": lambda: val @@ -1578,6 +1585,7 @@ def compile_factors() -> dict[str, object]: "VLLM_LOGGING_PREFIX", "VLLM_LOGGING_STREAM", "VLLM_LOGGING_CONFIG_PATH", + "VLLM_LOGGING_COLOR", "VLLM_LOG_STATS_INTERVAL", "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "VLLM_TUNED_CONFIG_FOLDER", @@ -1608,6 +1616,7 @@ def compile_factors() -> dict[str, object]: "VLLM_TEST_FORCE_LOAD_FORMAT", "LOCAL_RANK", "CUDA_VISIBLE_DEVICES", + "NO_COLOR", } from vllm.config.utils import normalize_value diff --git a/vllm/logger.py b/vllm/logger.py index 934100829684..772e36497b45 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -17,18 +17,25 @@ import vllm.envs as envs -VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING -VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH -VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL -VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX -VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM - _FORMAT = ( - f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + f"{envs.VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " "[%(fileinfo)s:%(lineno)d] %(message)s" ) _DATE_FORMAT = "%m-%d %H:%M:%S" + +def _use_color() -> bool: + if envs.NO_COLOR or envs.VLLM_LOGGING_COLOR == "0": + return False + if envs.VLLM_LOGGING_COLOR == "1": + return True + if envs.VLLM_LOGGING_STREAM == "ext://sys.stdout": # stdout + return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + elif envs.VLLM_LOGGING_STREAM == "ext://sys.stderr": # stderr + return hasattr(sys.stderr, "isatty") and sys.stderr.isatty() + return False + + DEFAULT_LOGGING_CONFIG = { "formatters": { "vllm": { @@ -36,13 +43,19 @@ "datefmt": _DATE_FORMAT, "format": _FORMAT, }, + "vllm_color": { + "class": "vllm.logging_utils.ColoredFormatter", + "datefmt": _DATE_FORMAT, + "format": _FORMAT, + }, }, "handlers": { "vllm": { "class": "logging.StreamHandler", - "formatter": "vllm", - "level": VLLM_LOGGING_LEVEL, - "stream": VLLM_LOGGING_STREAM, + # Choose formatter based on color setting. + "formatter": "vllm_color" if _use_color() else "vllm", + "level": envs.VLLM_LOGGING_LEVEL, + "stream": envs.VLLM_LOGGING_STREAM, }, }, "loggers": { @@ -144,7 +157,7 @@ def warning_once( def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() - if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: + if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( "VLLM_CONFIGURE_LOGGING evaluated to false, but " "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH " @@ -152,16 +165,22 @@ def _configure_vllm_root_logger() -> None: "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH." ) - if VLLM_CONFIGURE_LOGGING: + if envs.VLLM_CONFIGURE_LOGGING: logging_config = DEFAULT_LOGGING_CONFIG - if VLLM_LOGGING_CONFIG_PATH: - if not path.exists(VLLM_LOGGING_CONFIG_PATH): + vllm_handler = logging_config["handlers"]["vllm"] + # Refresh these values in case env vars have changed. 
+ vllm_handler["level"] = envs.VLLM_LOGGING_LEVEL + vllm_handler["stream"] = envs.VLLM_LOGGING_STREAM + vllm_handler["formatter"] = "vllm_color" if _use_color() else "vllm" + + if envs.VLLM_LOGGING_CONFIG_PATH: + if not path.exists(envs.VLLM_LOGGING_CONFIG_PATH): raise RuntimeError( "Could not load logging config. File does not exist: %s", - VLLM_LOGGING_CONFIG_PATH, + envs.VLLM_LOGGING_CONFIG_PATH, ) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: + with open(envs.VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 44b40ead973b..8d3354df215b 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter from vllm.logging_utils.lazy import lazy from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "ColoredFormatter", "lazy", "logtime", ] diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index 02ba308e1879..3ad4ef8d119a 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -75,3 +75,53 @@ def shrink_path(relpath: Path) -> str: parts = msg.split(record.message) msg = msg.replace("\n", "\r\n" + parts[0]) return msg + + +class ColoredFormatter(NewLineFormatter): + """Adds ANSI color codes to log levels for terminal output. + + This formatter adds colors by injecting them into the format string for + static elements (timestamp, filename, line number) and modifying the + levelname attribute for dynamic color selection. 
+ """ + + # ANSI color codes + COLORS = { + "DEBUG": "\033[37m", # White + "INFO": "\033[32m", # Green + "WARNING": "\033[33m", # Yellow + "ERROR": "\033[31m", # Red + "CRITICAL": "\033[35m", # Magenta + } + GREY = "\033[90m" # Grey for timestamp and file info + RESET = "\033[0m" + + def __init__(self, fmt, datefmt=None, style="%"): + # Inject grey color codes into format string for timestamp and file info + if fmt: + # Wrap %(asctime)s with grey + fmt = fmt.replace("%(asctime)s", f"{self.GREY}%(asctime)s{self.RESET}") + # Wrap [%(fileinfo)s:%(lineno)d] with grey + fmt = fmt.replace( + "[%(fileinfo)s:%(lineno)d]", + f"{self.GREY}[%(fileinfo)s:%(lineno)d]{self.RESET}", + ) + + # Call parent __init__ with potentially modified format string + super().__init__(fmt, datefmt, style) + + def format(self, record): + # Store original levelname to restore later (in case record is reused) + orig_levelname = record.levelname + + # Only modify levelname - it needs dynamic color based on severity + if (color_code := self.COLORS.get(record.levelname)) is not None: + record.levelname = f"{color_code}{record.levelname}{self.RESET}" + + # Call parent format which will handle everything else + msg = super().format(record) + + # Restore original levelname + record.levelname = orig_levelname + + return msg diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py index 5968884e232a..cc872040b6c5 100644 --- a/vllm/utils/system_utils.py +++ b/vllm/utils/system_utils.py @@ -22,7 +22,7 @@ logger = init_logger(__name__) -CYAN = "\033[1;36m" +CYAN = "\033[0;36m" RESET = "\033[0;0m" @@ -142,7 +142,10 @@ def set_process_title( def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: """Add colored prefix to file output for log decoration.""" - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + if envs.NO_COLOR: + prefix = f"({worker_name} pid={pid}) " + else: + prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " file_write = file.write def write_with_prefix(s: str): From 1d642872a27f1c6bedf28669642928cc7eec6532 Mon Sep 17 00:00:00 2001 From: liangel-02 Date: Wed, 19 Nov 2025 19:39:45 -0500 Subject: [PATCH 217/578] [torchao] fix safetensors for sharding (#28169) Signed-off-by: Angel Li --- tests/quantization/test_torchao.py | 9 ++++---- .../model_loader/default_loader.py | 2 +- .../model_loader/weight_utils.py | 23 +++++++++++++++---- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index fb8d6130c377..f35c3973ab6e 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -225,13 +225,12 @@ def test_reload_weights(): @pytest.mark.skip( reason="since torchao nightly is only compatible with torch nightly" "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip " - "torchao tests that requires newer versions (0.14.0.dev+) for now" + "torchao tests that requires newer versions (0.15.0.dev+) for now" ) -def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_runner): +def test_safetensors_model_loading_with_params(vllm_runner): torch._dynamo.reset() - model_name = ( - "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors" - ) + # using this model to test safetensors loading with file sharding + model_name = "torchao-testing/Qwen3-8B-INT4-0.15.0dev-safetensors" with vllm_runner(model_name=model_name, dtype="bfloat16") as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=4) diff --git 
a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index b80026741781..67aa584c6bda 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -279,7 +279,7 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: if ( hasattr(quant_config, "is_checkpoint_torchao_serialized") and quant_config.is_checkpoint_torchao_serialized - and torchao_version_at_least("0.14.0") + and torchao_version_at_least("0.15.0") ): self.load_config.safetensors_load_strategy = "torchao" diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 89634cbf4124..4572ebe2ea11 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -595,6 +595,9 @@ def safetensors_weights_iterator( if safetensors_load_strategy == "eager": loading_desc += " (eager)" + state_dict = {} + leftover_state_dict: dict[str, torch.Tensor] = {} + for st_file in tqdm( hf_weights_files, desc=loading_desc, @@ -606,9 +609,11 @@ def safetensors_weights_iterator( state_dict = load(f.read()) yield from state_dict.items() elif safetensors_load_strategy == "torchao": - if not torchao_version_at_least("0.14.0"): + # we can't load flattened torchao tensor subclasses directly into the model + # instead we reconstruct the subclasses here before returning + if not torchao_version_at_least("0.15.0"): raise ValueError( - "Please use torchao version >= 0.14.0 \ + "Please use torchao version >= 0.15.0 \ to load torchao safetensors checkpoint" ) from torchao.prototype.safetensors.safetensors_support import ( @@ -616,12 +621,20 @@ def safetensors_weights_iterator( ) with safe_open(st_file, framework="pt") as f: - state_dict = {} for name in f.keys(): # noqa: SIM118 state_dict[name] = f.get_tensor(name) + + # update with leftover tensor data from previous iteration, if any + state_dict.update(leftover_state_dict) metadata = f.metadata() - updated_state_dict = unflatten_tensor_state_dict(state_dict, metadata) - yield from updated_state_dict.items() + # due to sharded checkpoints, we are not guaranteed that we have all + # tensor subclass data on one file + # state_dict has the leftover data from this step and we wait for + # missing information to be provided in a future iteration + unflattened_state_dict, leftover_state_dict = ( + unflatten_tensor_state_dict(state_dict, metadata) + ) + yield from unflattened_state_dict.items() else: with safe_open(st_file, framework="pt") as f: for name in f.keys(): # noqa: SIM118 From 05c2dee7e9f485f1e76eee084849e07c1c12a68b Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Nov 2025 09:40:49 +0800 Subject: [PATCH 218/578] [DeepSeek + LMCache Multiprocess] handle MLA for deepseek model + LMCache Multiprocess connector (#29039) Signed-off-by: KuntaiDu --- .../kv_connector/v1/lmcache_mp_connector.py | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 55831dc56c80..22ddabbf1e35 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -7,6 +7,7 @@ import torch import zmq +from lmcache.integration.vllm.utils import mla_enabled from lmcache.utils import init_logger as lmcache_init_logger from 
vllm.config import VllmConfig @@ -60,17 +61,44 @@ def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]: return block_ids[0] +def extract_world_size_and_kv_rank( + world_size: int, + rank: int, + vllm_config: VllmConfig, +) -> tuple[int, int]: + """ + Convert the rank for the MLA. + """ + use_mla = mla_enabled(vllm_config.model_config) + if not use_mla: + return world_size, rank + else: + # Tensor parallel does not change the KV caches for MLA models. + # So we need to "exclude" the effect of TP on rank and world size + tp_size = vllm_config.parallel_config.tensor_parallel_size + # vLLM constructs TP groups first, and then construct other + # parallel groups on top of TP groups. + # for example, TP=4, PP=2, + # TP group: [0, 1, 2, 3], [4, 5, 6, 7] + # PP group: [0, 4], [1, 5], [2, 6], [3, 7] + # So we can "exclude" the effect of TP by rank // tp_size. + return world_size // tp_size, rank // tp_size + + def create_scheduler_adapter( server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig ) -> LMCacheMPSchedulerAdapter: - # TODO: have a helper function to calculate the correct rank and - # world size for the MLA and other models + world_size, kv_rank = extract_world_size_and_kv_rank( + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config, + ) return LMCacheMPSchedulerAdapter( server_url, zmq_context, vllm_config.model_config.model, - vllm_config.parallel_config.world_size, - vllm_config.parallel_config.rank, + world_size, + kv_rank, vllm_config.cache_config.block_size, ) @@ -78,14 +106,17 @@ def create_scheduler_adapter( def create_worker_adapter( server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig ) -> LMCacheMPWorkerAdapter: - # TODO: have a helper function to calculate the correct rank and - # world size for the MLA and other models + world_size, kv_rank = extract_world_size_and_kv_rank( + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config, + ) return LMCacheMPWorkerAdapter( server_url, zmq_context, vllm_config.model_config.model, - vllm_config.parallel_config.world_size, - vllm_config.parallel_config.rank, + world_size, + kv_rank, vllm_config.cache_config.block_size, ) From 3fb0d90999887949629d1e9bac4d98336a35c475 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Thu, 20 Nov 2025 10:11:52 +0800 Subject: [PATCH 219/578] [AMD] Use Decoupled Kernel Block Size to Support AITER MLA block_size=1 (#27715) Signed-off-by: chiangzhang --- vllm/attention/backends/abstract.py | 14 +++--- .../attention/backends/mla/rocm_aiter_mla.py | 45 +++---------------- 2 files changed, 13 insertions(+), 46 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index d28bc065852d..188becb6ad6f 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -119,14 +119,12 @@ def supports_block_size(cls, block_size: int | None) -> bool: return True for supported_size in cls.supported_kernel_block_sizes: - is_multiple_of = ( - isinstance(supported_size, MultipleOf) - and block_size % supported_size.base == 0 - ) - is_int_equal = ( - isinstance(supported_size, int) and block_size == supported_size - ) - if is_multiple_of or is_int_equal: + if isinstance(supported_size, MultipleOf): + supported_size = supported_size.base + # With hybrid_blocks feature, the framework-level block size + # only needs to be a multiple of the kernel's requirement, + # even if the kernel requires a fixed block_size. 
+ if block_size % supported_size == 0: return True return False diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index e1864526f02c..6ccc1a341d56 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -7,9 +7,8 @@ import torch from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.backends.abstract import AttentionLayer +from vllm.attention.backends.abstract import AttentionLayer, MultipleOf from vllm.config import VllmConfig -from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonDecodeMetadata, @@ -22,6 +21,8 @@ class AiterMLABackend(MLACommonBackend): + supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [1] + @staticmethod def get_name() -> str: return "ROCM_AITER_MLA" @@ -71,9 +72,8 @@ def __init__( ) self.compilation_config = vllm_config.compilation_config - max_num_pages_per_req = cdiv( - vllm_config.model_config.max_model_len, self.kv_cache_spec.block_size - ) + # kernel block size is always 1. + max_num_pages_per_req = vllm_config.model_config.max_model_len max_num_reqs = vllm_config.scheduler_config.max_num_seqs max_num_pages = max_num_reqs * max_num_pages_per_req @@ -82,11 +82,6 @@ def __init__( # so we can only use the persistent buffer if a cudagraph is actually # being used. if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): - self.block_table_remapping = torch.zeros( - [max_num_reqs, max_num_pages_per_req * self.kv_cache_spec.block_size], - dtype=torch.int32, - device=device, - ) self.paged_kv_indptr = torch.zeros( max_num_reqs + 1, dtype=torch.int32, device=device ) @@ -111,36 +106,16 @@ def _build_decode( num_decode_tokens: int, dcp_tot_seq_lens_device: torch.Tensor | None, ) -> AiterMLADecodeMetadata: - page_size = self.kv_cache_spec.block_size + # kernel block size is always 1, although the kv block size is not 1. 
device = self.device num_reqs = seq_lens_device.size(0) - bs, _ = block_table_tensor.shape - block_table_tensor = ( - block_table_tensor.unsqueeze(-1).expand(-1, -1, page_size) * page_size - ) - block_table_tensor = ( - block_table_tensor - + torch.arange( - 0, - page_size, - device=block_table_tensor.device, - dtype=block_table_tensor.dtype, - )[None, None, :] - ) - block_table_tensor = block_table_tensor.view(bs, -1) - # after remapping, we assume the block size already equals to 1 - - max_blk_size_per_req = block_table_tensor.shape[-1] mask = torch.arange( block_table_tensor.size(1), dtype=block_table_tensor.dtype, device=device ).unsqueeze(0) < seq_lens_device.unsqueeze(1) paged_kv_indices = block_table_tensor[mask] - paged_kv_last_page_len = seq_lens_device % page_size - paged_kv_last_page_len = torch.where( - paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len - ) + paged_kv_last_page_len = torch.where(seq_lens_device == 0, 1, seq_lens_device) paged_kv_indptr = torch.cat( [ @@ -151,12 +126,6 @@ def _build_decode( if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): num_actual_pages = paged_kv_indices.size(0) - self.block_table_remapping[:num_reqs, :max_blk_size_per_req].copy_( - block_table_tensor, non_blocking=True - ) - block_table_tensor = self.block_table_remapping[ - :num_reqs, :max_blk_size_per_req - ] self.paged_kv_indices[:num_actual_pages].copy_( paged_kv_indices, non_blocking=True From 3168285fcaaee09bc93dce7bc9ae6ee823c71652 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Thu, 20 Nov 2025 02:37:09 +0000 Subject: [PATCH 220/578] [cpu][ci] Add initial set of tests for Arm CPUs (#28657) Signed-off-by: Fadi Arafeh --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 64 +++++++++++++++++++ docker/Dockerfile.cpu | 10 +++ 2 files changed, 74 insertions(+) create mode 100755 .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh new file mode 100755 index 000000000000..d0036f24c8d0 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-0-16} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} +NUMA_NODE=${NUMA_NODE:-0} + +export CMAKE_BUILD_PARALLEL_LEVEL=32 + +# Setup cleanup +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" || true; +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . + +# Run the image, setting --shm-size=4g for tensor parallel. 
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" + +function cpu_tests() { + set -e + export NUMA_NODE=$2 + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + + # offline inference + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" + + # basic online serving + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 4c961defaeda..eb3807ef0ca4 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh +ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 ENV CCACHE_DIR=/root/.cache/ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -122,6 +123,15 @@ WORKDIR /workspace/vllm RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ + remove_packages_not_supported_on_aarch64() { \ + case "$(uname -m)" in \ + aarch64|arm64) \ + sed -i '/decord/d' requirements/cpu-test.in; \ + sed -i '/terratorch/d' requirements/cpu-test.in; \ + ;; \ + esac; \ + }; \ + remove_packages_not_supported_on_aarch64 && \ sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ From fcbcba6c70a3308705aa21adebb443bf9015b486 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Wed, 19 Nov 2025 22:17:48 -0500 Subject: [PATCH 221/578] [Feat] Iteration-level profiling for Torch and CUDA profiler (#28987) Signed-off-by: Benjamin Chislett Signed-off-by: Benjamin Chislett Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/v1/worker/test_gpu_profiler.py | 203 +++++++++++++++++++++++++ vllm/envs.py | 16 ++ vllm/profiler/gpu_profiler.py | 213 ++++++++++++++++++++++++--- vllm/v1/engine/async_llm.py | 14 +- vllm/v1/worker/gpu_worker.py | 50 ++----- 5 files changed, 435 insertions(+), 61 deletions(-) create mode 100644 tests/v1/worker/test_gpu_profiler.py diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py new file mode 100644 index 000000000000..f7255fae05a4 --- /dev/null 
+++ b/tests/v1/worker/test_gpu_profiler.py @@ -0,0 +1,203 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +import vllm.envs as envs +from vllm.profiler.gpu_profiler import WorkerProfiler + + +class ConcreteWorkerProfiler(WorkerProfiler): + """ + A basic implementation of a worker profiler for testing purposes. + """ + + def __init__(self): + self.start_call_count = 0 + self.stop_call_count = 0 + self.should_fail_start = False + super().__init__() + + def _start(self) -> None: + if self.should_fail_start: + raise RuntimeError("Simulated start failure") + self.start_call_count += 1 + + def _stop(self) -> None: + self.stop_call_count += 1 + + +@pytest.fixture(autouse=True) +def reset_mocks(): + """Fixture to reset mocks and env variables before each test.""" + envs.VLLM_PROFILER_DELAY_ITERS = 0 + envs.VLLM_PROFILER_MAX_ITERS = 0 + + +def test_immediate_start_stop(): + """Test standard start without delay.""" + profiler = ConcreteWorkerProfiler() + + profiler.start() + assert profiler._running is True + assert profiler._active is True + assert profiler.start_call_count == 1 + + profiler.stop() + assert profiler._running is False + assert profiler._active is False + assert profiler.stop_call_count == 1 + + +def test_delayed_start(): + """Test that profiler waits for N steps before actually starting.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + # User requests start + profiler.start() + + # Should be active (request accepted) but not running (waiting for delay) + assert profiler._active is True + assert profiler._running is False + assert profiler.start_call_count == 0 + + # Step 1 + profiler.step() + assert profiler._running is False + + # Step 2 (Threshold reached) + profiler.step() + assert profiler._running is True + assert profiler.start_call_count == 1 + + +def test_max_iterations(): + """Test that profiler stops automatically after max iterations.""" + envs.VLLM_PROFILER_MAX_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + profiler.start() + assert profiler._running is True + + # Iteration 1 + profiler.step() # profiling_count becomes 1 + assert profiler._running is True + + # Iteration 2 + profiler.step() # profiling_count becomes 2 + assert profiler._running is True + + # Iteration 3 (Exceeds max) + profiler.step() # profiling_count becomes 3 + + # Should have stopped now + assert profiler._running is False + assert profiler.stop_call_count == 1 + + +def test_delayed_start_and_max_iters(): + """Test combined delayed start and max iterations.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + envs.VLLM_PROFILER_MAX_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + profiler.start() + + # Step 1 + profiler.step() + assert profiler._running is False + assert profiler._active is True + + # Step 2 (Starts now) + profiler.step() + assert profiler._profiling_for_iters == 1 + assert profiler._running is True + assert profiler._active is True + + # Next iteration + profiler.step() + assert profiler._profiling_for_iters == 2 + assert profiler._running is True + + # Iteration 2 (exceeds max) + profiler.step() + + # Should have stopped now + assert profiler._running is False + assert profiler.stop_call_count == 1 + + +def test_idempotency(): + """Test that calling start/stop multiple times doesn't break logic.""" + profiler = ConcreteWorkerProfiler() + + # Double Start + profiler.start() + profiler.start() + assert profiler.start_call_count == 1 # Should only start once + + # Double Stop 
+ profiler.stop() + profiler.stop() + assert profiler.stop_call_count == 1 # Should only stop once + + +def test_step_inactive(): + """Test that stepping while inactive does nothing.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + # Not started yet + profiler.step() + profiler.step() + + # Even though we stepped 2 times, start shouldn't happen because active=False + assert profiler.start_call_count == 0 + + +def test_start_failure(): + """Test behavior when the underlying _start method raises exception.""" + profiler = ConcreteWorkerProfiler() + profiler.should_fail_start = True + + profiler.start() + + # Exception caught in _call_start + assert profiler._running is False # Should not mark as running + assert profiler._active is True # Request is still considered active + assert profiler.start_call_count == 0 # Logic failed inside start + + +def test_shutdown(): + """Test that shutdown calls stop only if running.""" + profiler = ConcreteWorkerProfiler() + + # Case 1: Not running + profiler.shutdown() + assert profiler.stop_call_count == 0 + + # Case 2: Running + profiler.start() + profiler.shutdown() + assert profiler.stop_call_count == 1 + + +def test_mixed_delay_and_stop(): + """Test manual stop during the delay period.""" + envs.VLLM_PROFILER_DELAY_ITERS = 5 + profiler = ConcreteWorkerProfiler() + + profiler.start() + profiler.step() + profiler.step() + + # User cancels before delay finishes + profiler.stop() + assert profiler._active is False + + # Further steps should not trigger start + profiler.step() + profiler.step() + profiler.step() + + assert profiler.start_call_count == 0 diff --git a/vllm/envs.py b/vllm/envs.py index 614bc94b978b..888a09cf6d3e 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -92,11 +92,14 @@ VLLM_TORCH_PROFILER_DIR: str | None = None VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False + VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False VLLM_TORCH_PROFILER_WITH_STACK: bool = True VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False + VLLM_PROFILER_DELAY_ITERS: int = 0 + VLLM_PROFILER_MAX_ITERS: int = 0 VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -872,6 +875,19 @@ def get_vllm_port() -> int | None: "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" ), + # Disable torch profiling of the AsyncLLMEngine process. + # If set to 1, will not profile the engine process. + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0" + ), + # Delay number of iterations before starting profiling when using + # the torch/torch CUDA profiler. If set to 0, will start profiling immediately. + "VLLM_PROFILER_DELAY_ITERS": lambda: int( + os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") + ), + # Maximum number of iterations to profile when using the torch/torch CUDA profiler. + # If set to 0, will not limit the number of iterations. + "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), # If set, vLLM will use Triton implementations of AWQ. 
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), # If set, allow loading or unloading lora adapters in runtime, diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py index 58c668953161..2155b67a3db4 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/gpu_profiler.py @@ -1,37 +1,212 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from contextlib import nullcontext + +import torch +from typing_extensions import override + +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) -class CudaProfilerWrapper: +class WorkerProfiler(ABC): def __init__(self) -> None: - self._profiler_running = False - # Note: lazy import to avoid dependency issues if CUDA is not available. - import torch.cuda.profiler as cuda_profiler + self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS + if self._delay_iters > 0: + logger.info_once( + "GPU profiling will start " + f"{self._delay_iters} steps after start_profile." + ) - self._cuda_profiler = cuda_profiler + self._max_iters = envs.VLLM_PROFILER_MAX_ITERS + if self._max_iters > 0: + logger.info_once( + "GPU profiling will stop " + f"after {self._max_iters} worker steps, " + "or when stop_profile is received." + ) - def start(self) -> None: + # Track when the profiler gets triggered by start_profile + self._active_iteration_count = 0 + self._active = False + + # Track when the profiler is actually running + self._profiling_for_iters = 0 + self._running = False + + @abstractmethod + def _start(self) -> None: + """Start the profiler.""" + pass + + @abstractmethod + def _stop(self) -> None: + """Stop the profiler.""" + pass + + def _call_start(self) -> None: + """Call _start with error handling but no safeguards.""" try: - self._cuda_profiler.start() - self._profiler_running = True - logger.info_once("Started CUDA profiler") + self._start() + self._running = True # Only mark as running if start succeeds except Exception as e: - logger.warning_once("Failed to start CUDA profiler: %s", e) + logger.warning("Failed to start profiler: %s", e) + + def _call_stop(self) -> None: + """Call _stop with error handling but no safeguards.""" + try: + self._stop() + logger.info("Profiler stopped successfully.") + except Exception as e: + logger.warning("Failed to stop profiler: %s", e) + self._running = False # Always mark as not running, assume stop worked + + def start(self) -> None: + """Attempt to start the profiler, accounting for delayed starts.""" + if self._active: + logger.debug( + "start_profile received when profiler is already active. " + "Ignoring request." + ) + return + self._active = True + if self._delay_iters == 0: + self._call_start() + + def step(self) -> None: + """Update the profiler state at each worker step, + to handle delayed starts and max iteration limits.""" + if not self._active: + return + + self._active_iteration_count += 1 + + if ( + not self._running + and self._delay_iters > 0 + and self._active_iteration_count == self._delay_iters + ): + logger.info("Starting profiler after delay...") + self._call_start() + + if self._running: + self._profiling_for_iters += 1 + + if ( + self._max_iters > 0 + and self._running + and self._profiling_for_iters > self._max_iters + ): + # Automatically stop the profiler after max iters + # will be marked as not running, but leave as active so that stop + # can clean up properly + logger.info("Max profiling iterations reached. 
Stopping profiler...") + self._call_stop() + return def stop(self) -> None: - if self._profiler_running: - try: - self._cuda_profiler.stop() - logger.info_once("Stopped CUDA profiler") - except Exception as e: - logger.warning_once("Failed to stop CUDA profiler: %s", e) - finally: - self._profiler_running = False + """Attempt to stop the profiler, accounting for overlapped calls.""" + if not self._active: + logger.debug( + "stop_profile received when profiler is not active. Ignoring request." + ) + return + self._active = False + self._active_iteration_count = 0 + self._profiling_for_iters = 0 + + if self._running: + self._call_stop() def shutdown(self) -> None: """Ensure profiler is stopped when shutting down.""" - self.stop() + logger.info_once("Shutting down profiler") + if self._running: + self.stop() + + def annotate_context_manager(self, name: str): + """Return a context manager to annotate profiler traces.""" + return nullcontext() + + +class TorchProfilerWrapper(WorkerProfiler): + def __init__(self, worker_name: str, local_rank: int) -> None: + super().__init__() + + self.local_rank = local_rank + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info( + "Torch profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir, + ) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True + ), + ) + + @override + def _start(self) -> None: + self.profiler.start() + + @override + def _stop(self) -> None: + self.profiler.stop() + + rank = self.local_rank + profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" + sort_key = "self_cuda_time_total" + table = self.profiler.key_averages().table(sort_by=sort_key) + + with open(profiler_out_file, "w") as f: + print(table, file=f) + + # only print profiler results on rank 0 + if rank == 0: + print(table) + + @override + def annotate_context_manager(self, name: str): + return torch.profiler.record_function(name) + + +class CudaProfilerWrapper(WorkerProfiler): + def __init__(self) -> None: + super().__init__() + # Note: lazy import to avoid dependency issues if CUDA is not available. 
+ import torch.cuda.profiler as cuda_profiler + + self._cuda_profiler = cuda_profiler + + @override + def _start(self) -> None: + self._cuda_profiler.start() + + @override + def _stop(self) -> None: + self._cuda_profiler.stop() + + @override + def annotate_context_manager(self, name: str): + return torch.cuda.nvtx.range(name) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c160c7cbcab4..abf2c8cfa453 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -160,11 +160,23 @@ def __init__( except RuntimeError: pass - if envs.VLLM_TORCH_PROFILER_DIR: + if ( + envs.VLLM_TORCH_PROFILER_DIR + and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM + ): logger.info( "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 envs.VLLM_TORCH_PROFILER_DIR, ) + if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0: + logger.warning_once( + "Torch profiler received max_iters or delay_iters setting. These " + "are not compatible with the AsyncLLM profiler and will be ignored " + "for the AsyncLLM process. Engine process profiling will still " + "respect these settings. Consider setting " + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable " + "AsyncLLM profiling." + ) worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" self.profiler = torch.profiler.profile( activities=[ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7f9cdd221224..18cbc3826279 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -36,7 +36,7 @@ from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform -from vllm.profiler.gpu_profiler import CudaProfilerWrapper +from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes @@ -90,32 +90,9 @@ def __init__( # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir, - ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True - ), + self.profiler = TorchProfilerWrapper( + worker_name=worker_name, local_rank=self.local_rank ) elif envs.VLLM_TORCH_CUDA_PROFILE: self.profiler = CudaProfilerWrapper() @@ -526,10 +503,12 @@ def annotate_profile(self, scheduler_output): if not self.profiler: return nullcontext() + self.profiler.step() + num_new = len(scheduler_output.scheduled_new_reqs) num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids) - return torch.profiler.record_function( + return self.profiler.annotate_context_manager( f"execute_new_{num_new}_cached_{num_cached}" ) @@ -587,24 +566,11 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def profile(self, is_start: bool = True): if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") + raise RuntimeError("Profiling is not enabled.") if is_start: self.profiler.start() else: self.profiler.stop() - if isinstance(self.profiler, torch.profiler.profile): - rank = self.local_rank - profiler_dir = envs.VLLM_TORCH_PROFILER_DIR - profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" - sort_key = "self_cuda_time_total" - table = self.profiler.key_averages().table(sort_by=sort_key) - - with open(profiler_out_file, "w") as f: - print(table, file=f) - - # only print profiler results on rank 0 - if rank == 0: - print(table) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1, uniform_decode=True) @@ -865,6 +831,8 @@ def save_tensorized_model( def shutdown(self) -> None: if runner := getattr(self, "model_runner", None): runner.ensure_kv_transfer_shutdown() + if self.profiler is not None: + self.profiler.shutdown() def init_worker_distributed_environment( From a8c536829cb7b5564f54beff97e938666f286dd6 Mon Sep 17 00:00:00 2001 From: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:39:36 -0800 Subject: [PATCH 222/578] Consolidate Nvidia ModelOpt quant config handling for all quantization methods (#28076) Signed-off-by: Shengliang Xu --- .../layers/quantization/modelopt.py | 499 ++++++++---------- 1 file changed, 234 insertions(+), 265 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index dedab33c1bdb..6b5ed7762eb3 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from fnmatch import fnmatch from typing import TYPE_CHECKING, Any, Optional import torch @@ -13,7 +14,6 @@ from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger from 
vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, @@ -86,45 +86,218 @@ KV_CACHE_QUANT_ALGOS = ["FP8"] -class ModelOptFp8Config(QuantizationConfig): +class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from FP8 checkpoints. + """ + + def __init__(self, quant_config: "ModelOptQuantConfigBase"): + super().__init__(quant_config) + + +class ModelOptQuantConfigBase(QuantizationConfig): + LinearMethodCls: type = LinearMethodBase + FusedMoEMethodCls: type = FusedMoEMethodBase + KVCacheMethodCls: type = BaseKVCacheMethod + + def __init__( + self, + exclude_modules: list[str], + ): + super().__init__() + self.exclude_modules: list[str] = exclude_modules + + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + + Handles both exact matching (for fused layers) and ModelOpt wildcard matching. + + The ModelOpt exclude_modules list is a list of wildcards. + """ + if len(self.exclude_modules) == 0: + return False + + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): + return True + + # TODO: This special hard coded logic is not needed for quantized checkpoints + # generated by ModelOpt >= 0.39.0 where they are handled natually by the + # exclude_modules config. But need to keep them for loading quantized + # checkpoints generated by older versions. Then check substring matching + # for patterns not caught by exact match + for exclude_module in self.exclude_modules: + # Skip exact matches already handled above + if exclude_module != prefix and ( + exclude_module in prefix + or ( + prefix.startswith("language_model.") + and exclude_module in prefix.removeprefix("language_model.") + ) + ): + return True + + # modelopt exclude modules are not simple strings, they are wildcards + for wildcard_pattern in self.exclude_modules: + if fnmatch(prefix, wildcard_pattern): + return True + + return False + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + # handle kv-cache first so we can focus only on weight quantization thereafter + if isinstance(layer, Attention): + return self.KVCacheMethodCls(self) + + # handle exclusion + if self.is_layer_excluded(prefix): + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + return None + + # TODO: This special hard coded logic is not needed for quantized checkpoints + # generated by ModelOpt >= 0.39.0 where they are handled natually by the + # exclude_modules config. But need to keep them for loading quantized + # checkpoints generated by older versions. 
Then check substring matching + # for patterns not caught by exact match + if "vision_tower" in prefix or "vision_model" in prefix: + return UnquantizedLinearMethod() + + # now, the layer is quantized, handle it here + if isinstance(layer, LinearBase): + return self.LinearMethodCls(self) + elif isinstance(layer, FusedMoE): + return self.FusedMoEMethodCls(quant_config=self, layer=layer) + + return None + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if len(self.exclude_modules) > 0: + self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) + + @staticmethod + def get_config_filenames() -> list[str]: + return ["hf_quant_config.json"] + + @classmethod + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + group_size: int | None, + ) -> "ModelOptQuantConfigBase": + raise NotImplementedError("Please implement this function in sub classes") + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase": + # Handle both ModelOpt format and compressed-tensors style format + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError("Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo") + kv_cache_quant_method = config.get("kv_cache_quant_algo") + # "ignore" is the key in config.json + exclude_modules = config.get("ignore", []) + group_size_raw = config.get("group_size") + + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + if kv_cache_quant_method is None: + # No KV cache quantization, keep this branch just to have this comment + pass + elif not isinstance(kv_cache_quant_method, str): + raise ValueError( + f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_method)}" + ) + + if not isinstance(exclude_modules, list): + raise ValueError( + f"exclude_modules must be a list, got {type(exclude_modules)}" + ) + + if group_size_raw is None: + group_size = None + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError( + f"group_size must be an integer, got {type(group_size_raw)}" + ) from None + + if quant_method not in QUANT_ALGOS: + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration." 
+ ) + return cls._from_config( + quant_method=quant_method, + kv_cache_quant_method=kv_cache_quant_method, + exclude_modules=exclude_modules, + group_size=group_size, + original_config=config, + ) + + +class ModelOptFp8Config(ModelOptQuantConfigBase): """Config class for ModelOpt FP8.""" def __init__( self, - is_checkpoint_fp8_serialized: bool = False, - kv_cache_quant_method: str | None = None, - exclude_modules: list[str] | None = None, + is_checkpoint_fp8_serialized: bool, + kv_cache_quant_method: str | None, + exclude_modules: list[str], ) -> None: - super().__init__() + super().__init__(exclude_modules) self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method - self.exclude_modules = exclude_modules or [] if is_checkpoint_fp8_serialized: logger.warning( "Detected ModelOpt fp8 checkpoint. Please note that" " the format is experimental and could change." ) - @classmethod - def get_name(cls) -> QuantizationMethods: + def get_name(self) -> QuantizationMethods: return "modelopt" - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod def get_min_capability(cls) -> int: return 89 - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["hf_quant_config.json"] - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - if self.exclude_modules is not None: - self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) - @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant @@ -158,88 +331,19 @@ def override_quantization_method( return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - # Handle both ModelOpt format and compressed-tensors style format - if "quantization" in config: - # ModelOpt format: {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules") - else: - # Compressed-tensors style format: - # {"quant_algo": "...", "quant_method": "modelopt"} - quant_method = config.get("quant_algo", "") - kv_cache_quant_method = config.get("kv_cache_quant_algo") - # "ignore" is the key in config.json - exclude_modules = config.get("ignore") - - if quant_method not in QUANT_ALGOS: - raise ValueError( - f"ModelOpt currently only supports: {QUANT_ALGOS} " - "quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration." - ) + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + **kwargs: Any, + ) -> "ModelOptFp8Config": is_checkpoint_fp8_serialized = "FP8" in quant_method return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules) - def is_layer_excluded(self, prefix: str) -> bool: - """ - Check if a layer should be excluded from quantization. - Handles both exact matching (for fused layers) and substring matching. 
- - This method handles both regular models and multimodal models that use - the language_model prefix. For multimodal models, it checks if the - module name (without the language_model prefix) is in the exclude list. - """ - if self.exclude_modules is None: - return False - - # First check exact matching with fused layer support - if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): - return True - - # Then check substring matching for patterns not caught by exact match - for module in self.exclude_modules: - # Skip exact matches already handled above - if module != prefix and ( - module in prefix - or ( - prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model.") - ) - ): - return True - return False - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import ( # Avoid circular import - Attention, - MLAAttention, - ) - - if isinstance(layer, LinearBase): - if self.is_layer_excluded(prefix): - return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - if "vision_tower" in prefix or "vision_model" in prefix: - return UnquantizedLinearMethod() - return ModelOptFp8LinearMethod(self) - elif isinstance(layer, (Attention, MLAAttention)): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - return ModelOptFp8MoEMethod(self, layer) - return None - class ModelOptFp8LinearMethod(LinearMethodBase): """Linear method for Model Optimizer static quantization. @@ -344,7 +448,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): def __init__( self, quant_config: ModelOptFp8Config, - layer: torch.nn.Module, + layer: FusedMoE, ) -> None: super().__init__(layer.moe_config) self.layer = layer @@ -686,7 +790,12 @@ def apply( ) -class ModelOptNvFp4Config(QuantizationConfig): +ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod +ModelOptFp8Config.FusedMoEMethodCls = ModelOptFp8MoEMethod +ModelOptFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod + + +class ModelOptNvFp4Config(ModelOptQuantConfigBase): """Config class for ModelOpt FP4.""" def __init__( @@ -696,7 +805,7 @@ def __init__( exclude_modules: list[str], group_size: int = 16, ) -> None: - super().__init__() + super().__init__(exclude_modules) self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: logger.warning( @@ -706,28 +815,17 @@ def __init__( self.group_size = group_size self.kv_cache_quant_algo = kv_cache_quant_algo - self.exclude_modules = exclude_modules - @classmethod - def get_name(cls) -> QuantizationMethods: + def get_name(self) -> QuantizationMethods: return "modelopt_fp4" - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.bfloat16, torch.half, torch.float8_e4m3fn] @classmethod def get_min_capability(cls) -> int: return 80 - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["hf_quant_config.json"] - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - if self.exclude_modules is not None: - self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) - @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant @@ -761,105 +859,25 @@ def override_quantization_method( return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": - # Handle both traditional ModelOpt format and 
compressed-tensors - # style format - if "quantization" in config: - # Traditional ModelOpt format: - # {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError( - f"kv_cache_quant_algo must be a string, got " - f"{type(kv_cache_quant_algo_raw)}" - ) - - # Handle group_size with proper type validation - group_size_raw = quant_config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError( - f"group_size must be an integer, got {type(group_size_raw)}" - ) from None - - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules", []) - if not isinstance(exclude_modules, list): - raise ValueError( - f"exclude_modules must be a list, got {type(exclude_modules)}" - ) - else: - # Compressed-tensors style format: - # {"quant_algo": "...", "quant_method": "modelopt"} - quant_method = config.get("quant_algo", "") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError( - f"kv_cache_quant_algo must be a string, got " - f"{type(kv_cache_quant_algo_raw)}" - ) - - # Handle group_size with proper type validation - group_size_raw = config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError( - f"group_size must be an integer, got {type(group_size_raw)}" - ) from None - - # "ignore" is the key in config.json - exclude_modules = config.get("ignore", []) - if not isinstance(exclude_modules, list): - raise ValueError( - f"exclude_modules must be a list, got {type(exclude_modules)}" - ) - - if quant_method not in QUANT_ALGOS: - raise ValueError( - f"ModelOpt currently only supports: {QUANT_ALGOS} " - "quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration." 
- ) + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + group_size: int | None, + **kwargs: Any, + ) -> "ModelOptNvFp4Config": is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method + if group_size is None: + group_size = 16 # Default value + # For FP4, these fields are required - if is_checkpoint_nvfp4_serialized and "quantization" in config: + if is_checkpoint_nvfp4_serialized and "quantization" in original_config: # Check if required fields are present in the quantization config - quant_config = config["quantization"] + quant_config = original_config["quantization"] required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"] missing_fields = [ field for field in required_fields if field not in quant_config @@ -872,64 +890,11 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": return cls( is_checkpoint_nvfp4_serialized, - kv_cache_quant_algo, + kv_cache_quant_method, exclude_modules, group_size, ) - def is_layer_excluded(self, prefix: str) -> bool: - """ - Check if a layer should be excluded from quantization. - Handles both exact matching (for fused layers) and pattern matching. - """ - # First check exact matching with fused layer support - if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): - return True - - # Check regex pattern matching for patterns not caught by exact match - import regex as re - - for pattern in self.exclude_modules: - # Skip patterns that would be caught by exact matching - if "*" in pattern or "." in pattern: - regex_str = pattern.replace(".", r"\.").replace("*", r".*") - if re.fullmatch(regex_str, prefix): - return True - return False - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import ( # Avoid circular import - Attention, - MLAAttention, - ) - - skip_layer = self.is_layer_excluded(prefix) - if isinstance(layer, LinearBase): - if skip_layer: - return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - if "vision_tower" in prefix or "vision_model" in prefix: - return UnquantizedLinearMethod() - return ModelOptNvFp4LinearMethod(self) - elif isinstance(layer, (Attention, MLAAttention)): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - if skip_layer: - return None - return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) - return None - - -class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): - """ - Supports loading kv-cache scaling factors from FP8 checkpoints. - """ - - def __init__(self, quant_config: ModelOptFp8Config | ModelOptNvFp4Config): - super().__init__(quant_config) - class ModelOptNvFp4LinearMethod(LinearMethodBase): """Linear method for Model Optimizer NVFP4. 
@@ -1157,14 +1122,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def __init__( self, quant_config: ModelOptNvFp4Config, - moe: FusedMoEConfig, - layer: torch.nn.Module, + layer: FusedMoE, ) -> None: from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( detect_nvfp4_moe_support, # noqa: E501 ) - super().__init__(moe) + super().__init__(layer.moe_config) self.quant_config = quant_config self.layer = layer _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) @@ -1802,3 +1766,8 @@ def apply( k=x.shape[1], e=layer.w13_weight.shape[0], ) + + +ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod +ModelOptNvFp4Config.FusedMoEMethodCls = ModelOptNvFp4FusedMoE +ModelOptNvFp4Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod From 0cca9b4d130b4caddb60086ef26a0d8741582dcb Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Wed, 19 Nov 2025 19:50:37 -0800 Subject: [PATCH 223/578] [Bugfix] Fix precision loss in LoRA-wrapped RowParallelLinear by fusing bias into GEMM (#28972) Signed-off-by: prashanth058 --- vllm/lora/layers/row_parallel_linear.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 2ef1bd98fc61..95517b1aee26 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -63,23 +63,18 @@ def forward( input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. - output_parallel = self.apply(input_parallel) + bias_ = ( + None + if (self.tp_rank > 0 or self.base_layer.skip_bias_add) + else self.base_layer.bias + ) + output_parallel = self.apply(input_parallel, bias_) if self.base_layer.reduce_results and self.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) - else: - output_ = output_parallel - - if not self.base_layer.skip_bias_add: - output = ( - output_ + self.base_layer.bias - if self.base_layer.bias is not None - else output_ - ) - output_bias = None + output = tensor_model_parallel_all_reduce(output_parallel) else: - output = output_ - output_bias = self.base_layer.bias + output = output_parallel + output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None if not self.base_layer.return_bias: return output @@ -120,7 +115,7 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: return lora_b def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape From fe25772aa97beb8bcb07ea49e06a2892b521a7ed Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Thu, 20 Nov 2025 12:38:12 +0800 Subject: [PATCH 224/578] [Bugfix] Handle broken frames in video loading (#29001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: gcanlin Signed-off-by: 凌葭 Co-authored-by: 凌葭 --- tests/multimodal/assets/corrupted.mp4 | Bin 0 -> 91678 bytes tests/multimodal/test_video.py | 37 ++++++++ vllm/multimodal/video.py | 118 ++++++++++++++++---------- 3 files changed, 112 insertions(+), 43 deletions(-) create mode 100644 tests/multimodal/assets/corrupted.mp4 diff --git a/tests/multimodal/assets/corrupted.mp4 b/tests/multimodal/assets/corrupted.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..c355bb932ceeeae13cc2d0a4752dcdf8c5136720 GIT binary patch literal 91678 zcmYJZV{|A@6D=IuImyY1ZQHhO+qP}nwr$(CZQHqd-uvOl^z7cftGcSXSI=5A0{{Sk zYwYM|Yvy2M1poj9@ZbISqStdaptG`Nqyqo|fH1Z zTQ^H1V@F(iT3QAwdRls>U((dk(UzTt#>K^j+S$y|$i`C7n%c(RgyuiL)TWMBmcJSs zTSqe+YX^2*13i5`11>sTdn02m23$iUeM=hy3obf#T6S7oJ!?HnHwPmwT2}^kT30$c z23#v6E>j~{Tn8upU&MlI>)`fl`rGQ*8*)1u11DB|9PbQwa~HGvo+y4~**MXg_j*YRggOMW_6+Nz_sr|2o0~aH%rHze+p6PE#=l|y%EX@pl zWBGpxI^6#e|EaC*|2LU|nU$X7e*u|UI~v(r>iw#Jd3{SKdp$QD0~;$_J;&d^!Edb` z?e)y8e|`N5?e+fC7~AVv89Dq`O<%{>?H8LFa?$@{JwrX)|I*Rd(Kpj`_-_(3d!zp) z=wf7MV(O^>J7!~RWUXUjWBWV$e@WY4Qwt-vU*BBxOtk+G)Uh(N=3>HiFfg($!3H8HdOU98`Y@w=A4zVz&V_rZUw3g8O>fY@jh8VI2A zvu{t=>S!UFYi^@R=LI9)fqSS^NNgmVo^)3?V7)u;f_ak5?>@9YGvrl$R^ znP+^J6xq+sI(DsF*>p0}8m$B!x|G#0)yvyB*iXl?@Or`Yz`G`-HB+OkhDMV1;)PLC zSz1;b?dP57Djj$Se7RT0_wA7Mm`gZ$YLd-*qO?CtRtY5kPU4BD2v4#CqBPwT7f8mg zCpO_(cuNn|M$AX68D}i}01(x?OPbRE5-82_*UUFnTfBv{Z<_2J(Wd9-uHKzoeJVP> zwpoyH!%lTPtfZ_S*j1T!Wgsa}9RPcOzaE20sc0%+TUyvjtt2bliJ6@T6@A5cF>Q71 z6(1X&ZeTWgy^Y?9Wg6QAe*Of`Y+Eu`WIPe{(tN)?p8`D)SUAt0a0B-2-&K>%ji|YDb3Bf0n3DIi-dAh!6@CXCUM({{(09A1@udG z|KuI>FZJ(Eo@PX#3kmcv5UESADcZ2)68a`2k~ww)uYaDAv_ll%QAQ9CF3*D^p_gs5 z2Sy!a~{V~ zKI_|2SH8ZAXTk%1jmc{-1wUT15Fj2lB4}#V$GMexgIt%yKODUZm~BmXr+ z12#b+=M&P5_5_Td9Qn>=nhszeJ45>+GN{hO#$z_XTo?}%KR&*T#iNg+W=i6ok>QuB zTJo1d{iR!UH3GN8C-YQ|%~M3qE(Y7$cS(7)(5$-Lqp<2JyQho+&!yNMQf80m*eH(j zfykGPM2{WSDBKTh&GNg}@iuwKqg_5T+68&sn_^XA?l};k45G$fL|k17?Ri#iK)W*7 zAfqzQ9&oA`ta)IE)!;8xO%F_Y&I(UGgpkNSf_78S#TUG+^rbwx!}u-X^lL?EHh0zx zw-UUIM%W)w1avc)G0Pk{O>}6bnB`5N zn!C}F^OMpE_@zgR7p3}c+9lV z!9Kff*w{U_bdX9(5#Iw+xqs8KB&=JywC5J?vwRRX?g~m%1>7d1@Fanjw%s_FonMr` zrZK69#I{l@j3Yl2?RVVz8%gIyJWYa>O#}N}ib6L9{U~rB3D*IDO)I;ZC{a~Z^8Y{a}_qp5rE+O7;SLEAvQB!Fc_3+_r z3HeJlAog}TLRVql6#8+_N3-_$UVe5$$_lBi7vmM9tpur%g6W^{$}4G0Og|CA?+2N; zWc?OPqla=FXN+fb`KvDX*k;mb;P+PK-V%#?3^9&-X;F?2r(xOv*oUktl0@SFxp3|} zhS$eBNf{YTuLaYN#Kt4^20}FP;M5R*)&VW~s2bF~3KpoXv`Y8MwxdH6owenHB2J{#O14x_=7{|-%Fi!X!Ykwoh%&@Lo z+t`3QBq}YA?~&v#i24ilY>AKwDv_j&OKv#@kWrZ`Kl>OP%%t~I zs`C?efC3iaPKmVCf9F$T#wI$niT~pe6UO5<#}i-FR!fo1YKrx4?_ktu+y%A!QMAC` ze#>)W^3Qgc%c+HKGMVZ)_Uc8Bdy%D8W!)so2~j)4Fs*4&x|jB}lZbE{NuzR?OpJ4} z$hzzX6pMb&_QxrTh^`x(Sw?YNphH#5``W`{dyQbMJ|_+UpKC|0JWmcUxmO&{{Vr+&)?%7_W>one>7&-OEpwf<`F4b+3p&w7u zzagF-q*NkB~y~c0`Xdbc6`oq2P-TgbehL z1jSlN)Ct=+`u-$P`DLZEcW>cvMvKvF{kSre4LN$d@6zI zO7a*BflU9;4JR6cceR)y!%M%Y(TadpmM?ZS&1Okh%^$c@%H*sYc~(O|6YlW!=z!N- zXOfrLl?;4{!gJ38uXRpnEHU5O!a}5h9w!mAqZJB@EeM0az^*@PndcwM(roY{Z=9E$ zhfg}Q@M;$yG~@yq-FCiiwOzbqjr>oLc3nWZO96eXUpPw>MerpWe_@M;x%Y$afh5aiO5XBv*j3_L!BNiPg=V=1@)SKi%>I zcu%NE-s%H-K#`u1kO_|ijPp$s_oEbhZ4xAP795;Hk1C_hS@6(%uwzTK(zzsm(oXcD z_x5Ev!1{kejVRq(Q4c~%Ro7wpQ^Q=0LgQlQimfmbyg(H5GHzM0vpha%-HG!OMF`)r zbFNq>D1w4Gyf`Hm1KG$fOoA>70_VnP(FXr{XX$4>E4RSz?z9 zpeeF*LRs|^_HZ;RUhMJkdn}$KO>SKFq5?$1k-DxjK~l=x>ZRuyjd9%%1Q#9)_a(C~ z=DBjFv7_&UjkxR>CUboR^_Tv6aunJl0g@#@UHYEpbDmYH9CCi~%FL|xoIlt&P_+TA zd^$MBM8AeRN%!wgz4z}Tlb82}<10pgKdGyR-r9(t-^yOzZXD&=>fYzaR?r-KXste) zT4qp36>;In=yDKz!H7{(1~Sx8D~c%MMP(_dsA^nL*LeRlF^ z!+6_pEq;i{B>m%pZ%8+oO=$w&U8MQ#nUqEM6QR@E<$`@$P4oU5HM3mrAt5H1uCbz5 z_xqt;%+aF2tiYOp+uh=*CZPGS1*+^}M&<#);~H2+#PUuCe(3Ny+c5}`KY4~iqAu{z zOu|Vy#VgG{_O=67L_Lp)K*(*Cn%oGI81N?j%|nKo199#T1PdP8zFNqrB{y{(WyvM1 zvx7EdoZNvXE_b!vAx4-*Mj+>rbeB4+ti^`xzQtZT=WG!}^MxY%R6rr)(cBrawttK| zvl~wR%Hq{Kx)+N_a@M5Ja}S+r7tOpi;EjpBsMw!Zip@Rnr$UNeNzMU^&U~ejOEI@- zp(Re=#HmHNu;nGnxaF5bk~1x%&p&YpaOIXbiL|`!q#_#3YesksIcg@WVHqfoao^{i zVow|x3m%gU0qc|Re63+!RE1M#U$iWp!ftqHB_+DuyC}P*BG}T-YiBkwj(2241!A<{_=&nQUFDyd(Mve+jA 
zef6HrI>C>RzJ(m-$KCdAv78V_hmfoXC9!ok8?c8(X(y6kYd2^!qp=CEiB8}zbN?gs z=+D-N5#eP8iPQ%PnZTT2@fY<`=A)kk< z3*F&J$33}{SIn&%8%2kCytn_$jEF{Q-x@L)L6V8s&-Ap8Mm1W}vQKrVORnv=dq&0k z{juc2_SyanDs4YtMUd4UI^76$49x&Ym9%1tXOLY_OErSyQ^afqSwpK4ZZ5roD|pz)SQHm+FD9%C{1~M zpnbbl;JbzEniy^#yep>sA+cj-B38OxqZ_+6su4OXJ^qO+%|@BUE!$%(xabAK?7R6i z(cs|6EA2@ZM6}n!kqKH>rnIlN8^_(0LHZ!U}+qQxScef^YxM6WK}Wbc$w8&kFiS^OZ2(AW3WP`?N@9L7m4J+l03*pMLBTr|s=<+KWgW zpE8U}@5qW_p#TZ5o}G9196NYq<9R;wZbu1lhW8*vVy?cjb1qrSnOmePp-YdSt2ToL zQ>(|W^B3k_vWfO0L$OA;S@}EPgwF!k%>t@!*yH2k$#t9 zyH=gImzsz81DozoWb+?pJd-p}vemdE3;K=JZGpM4w01KY2j_-G#Yl-3=D7<;MYPOi zpcfuSR*OezqLTz(&r`*y`58m4=O zkqUki?^cB#tQVUC_;PRYuK|v%V)eL@l3|?TuVbHDlpk+%D~i=c)ck3<6v~=whozm9 zcC2Pl(l)0nnFF58;2IU-+#CIG?F39Hvd=h6^=E_W>$hQ4B;ywvFYU@?Jx9Z;A4!_P zj^UjkpmiDHb;r`r>OY{@qg(xHRwk&o`T^dR&SdbHX0nCOgJ)O8s`k473_0G((F|h* z*x1~p?9ki*0MNjUfFMG7L3-k7r%EJC(Mo5&k0I#hCIPR<8NCv(a6|N3_9dsjr)o4q z_#N8BHmCaEA!R?@jeS@{0(}0EM@O+G?`GMh0zn9AeE5A%d_ON8A*{RBk2T0_gtqQK zF}_b@ffY8|04otGV}!*r-6QR@pT}eqt5Tm0#~j?z5`GA}I}Z!#Y6rgC?5+YRSgZ}j z+d^LL0m#0ZLGnvl)=a?8CwNZ1zD9dE*Y*i9cD49pJ;Kk2#cKu79G9M0^y}iQvy1e# zA-9j<#+2qH;PDsE{DFJ4a}0G$pQF^*FreagQ6rTS$>)_#2 zoredMWSa+pej-HL7VVcFMmv>x$gOr>T!}xZ+VLGK0(!5)V2mqJ0V8b& zcyj0ge8-l`Ue(RCeMg?C(l&Fq1A1zu$`=i}uXzv$D7OrmUt;FupmwC&j-bT5BXkG+ zS}R$+=M1Zp*M!q$0{Zk&q?C*v+iBptr##XcWm!8&z~e>K}_43dSYP zhxl)^V>ooukUT+#-siDBB|ZTf2$%=$l$qWVR7XI?yNK*HBN8Z=8mcllA=K41wHh(X zCc443ZXJwRDD(G$I~UQ8V^QcGt~kPu4|+z4^^TmnbD92phOI5hh*aa~!BCJPU}1Rn zgA88Uvha|D!jMpnd0vbNnlDpOr)15KVCAb+gw&ALm*783hJf>SZM*z0 zI-ze0pAn_nCjPHnG}dck_V$fb^!JfQiNbu1RX?eJG4297AM`GDoOT+z`w-FNMy0I+bjDf|KdvjkQhv9+C#oW&a0XO9F+E80uLDu$`F zKZWRKa3+p?O@GI;z;~5^g1&^pZDRWOm|-M`Yzm-!pg`i_{X1PL_x#(k5|M~Q4-;S8 zV*tLUTCS%kd1odM`jFbBi5PL1Q96j0tUfQrmV@`6g9u4PosDt(sg*R)MLE55Xp}SnU zZJwoFdQ^r_SgoAI7u450)r^CObQUNngDOtgP4(`gwJ2@C2Vn?1PzbSIA#N9b)B?8j zbQZ2-W{G?Mr%9EG1^+DB0*e;)L7(H25D6lH?H1EaCmNbA@9vu9L|Yg%iP^-KUv{95 z5e<`A37zO+aw(|Gj-y%PL7%#>TCMQaWUyv#E(r$_uez{i?s*vh7{Op|r32enH$Vtpr06QY%rcc%6i9`?7P!V(7XZE(^_;K zWF-eI7V^r=&rw4(sP#&+pYpUPq z*_GQxVWbCAmHYMA^LpH3{Nx{ZuMM)TRq7X+fC{whmv&M2ST--n3y_?Lt62=M-kDhF zMMe|4tG2bIL|bPP#`hw~mtaWuF>i?zSiWwUE??_{)v%BR*sBJB_RXubp2$D zGx)ne4Nerz_$ceR%%pTRYrE33)~VXaPXZ^}WL6R&+2vKlcM3{Mtf#70jBKF0J#T_6 z^wO|0Ds#(TaXqm{tKJT&BL3>1xM>wcIpVb5hovS_laCEV+u5~b3kivuelLipk_eaXRH7BI!sN9*b$H#A_r1_E z=~EsQT`A5-GFm(}HokC+L3H#L^7r=@B}I46@nP}O_|4o4*rHF6Xe;NRM<&)-4~=ixof4^--|pG+()wT;jsI zzUSuYUV*iwFyH^&*)J^yo_#O-Ml6@VlhnKJfkFR0_6dBIdmIWfU#`Am9g(eS0^@{rVAIH%z=4)-F@t1zJ19XY&*#Y1S?pMkVJ~ejw+6CB}`P>QEnB6z`WVRuDvSZV<8KgQyUbq@kjsCszTm}>VcAN&$n~7`L#pbNe2bH zMmszN60;-?^{N3$2wm$K`bkM!uA&z$ALXJawu`HGNttFM7m)STP}^#^&pS0ClVU&@uLh&Ok1QFmSwDTPEqQk;R z)78UR%<$s5Z+%y#_=x*={2)AdpQ04lVUm=W*y~O%=c#3|3R#P2)l_?cBC) zoy-v6fO#DZ_diosKAnEXX0Lb$@h*TAGLXEs2d+r`-YTUBgAxJ-$#Y+2JwnoZiyVbm zLu&*?u_ex^E5zXqZvC+EUZF7bVTjKHx0qPCiv${miOobAJi-HOUgZeEy$#T^zD&&N z!4rtdhAyOTR6~NFctiEiYC+YfeB_|ugAj!hWz#(PdR?SJIDdCaa?Usyj!g#|ID~`H z4)d<~m-48h{Z-rd&Swg=f0Qu*19d3%OcN`Qe&%SPfxy)#R06t3oFts+5fOG#=tKvn z`Z>4GlGYHjnA~tpxAR=l__^q{;|(BKB4(Mc+8+I@G*!yl zGAc0l)v8tFT>Cm8_(Kbbqx>|%uULFZ9C?(+i}HQr|2KvyMe zr7-a+ea#69zle5sxkDVCH-1q|)X*yg{s5R0+cOS%1X)d!EohQcS9(BdC$ogr{g{kr zz4PSjMZ}k=%iQAo3SmJ-0KSKQL@SYJ`BgNyUko+l-eHA)CUlA08)Zu6kR^qTtkD|l zOhhKCW|vVs;j$71rqhWc{MiVe*10v#pstYwp>KS1v-vNatSFSW|6i+{)i1*!-tGE$ zeKTk-aQpYV<(nH;M}%8dFdDP1x2gW5Pvczp1orygkDK%yt#aym3>* zV^NS7qv1iGz&BY$vM_XWm8#m7vTLx zI(Dr+k;Z6xP`et(ObMo~)`Nj}Tj_rhf4SWG1?-J*a-I&QDr*>}zvB->JMI~ov(LHd zSt382IUl(F`rC+@Ytq!4npOtinuy{}jZsyy{@NBWudW%g3$S!B!-}eK!~3&7Zu@A2 z=D&@j4QNZRl}j?{F3P?eO+Wpi#a+m^rw>M)%Iu#I8e4Dk6kNuv#^FsKduq+?c=OBX 
z1(Nt_wW*2?CgO@(G-A1$Zyj(xRP*y5oqpAY0EV@&_=>9%huk&@hOvaoaw5h@Mifb2 zfNb{|puLKYogCmsH@zLgy0>+eSiC*b9&i}q{YUdIi`H3HfeW@#O*7%d0eWu*Q3hzL zQv(MPt7xk_{+I1TTO6mN88?HfSEL!{@6z9Pa`b!{F&+rt-S-#Mp{|OjH)zJK6CXO+ z*a@7%n{X7-rP+>A@Rn?YscXA37_V4#VcN=s-a&zc6Brrk*I+&ft?+9TDMX5#sAK4m z>N2VNXlRpUi6RbOzo!gTg*CFtI#-VTGv*y158WSaeX>>sci$z%KPwLY7UjL|i^30~ zjul{6lsXRLP+Z>M#GAb_xp4y_`elicydihF8Fn^eC!O^< z*L>&ylB6}Rho+Gbx!NOl4HWSvyuW;@>Nh3Pi=g&QnjtX~(XGOsP)cCF_4lAmq`XWL zhxMp%whZ-%;FxE0uc6Lc6><;IbgWv5uoIzn_OL4_HCVsoD`DC`P6b-y8EVHf?R%`} zAMf&U!cji{U>_BtU_MpA9cz}={;8=I+>>|O-b7n+ptFH<`-0efrT6r$pXT!iN~+X)0o5W~K-j&(5o0uRm!#G%cg(U7`u@NV zo-<8&SBuKsU$Qx{)Kx#|R5l_Z4Sdda4W%mS{iVE$EY5K$d+>)yu2uM6q_;ti9;)I} z>0dCf3$R^_OKuuZ=#;}S?9;%p!OwCeozg38uR`xj&SxaZw}I#V)t=`fvxQeFv%4O0 zDuHMe7U0rf0M`>uyL|7^C-Yc>%z`_%k!omuJ&A7L_kWrU_C9-=ij6+XmEgb*@em>g z(BR868$LeN_mxSFYTkjotDX*wXHYfQ4*Klg$VUjL!L(Flv#|UdA0Qbo4#8c{6uYP8 zkvr=l3Iv2=4uIlkD|N{pb-E!ES_2$EY+TCIKlyST$p?!9TKjce_NUsE6KOoSg&$A# zD!p-yra~f&9}bR<9I=YUvyN9cEjh`g_Rw&9^8Fk!7}6Kv(N*)5Dsks{eOW7W|9WL< zQFwa~FEj12nT^^AwaK3(QpwK&(jho7aeAhxqzXN~pLuK4M5QG}m z3I3s=yUbj}lsyh=oA)rWMI`v#kPO@o6OZm@L)<0#v1JJkfUGKu1|xmt8H&bt*n*(3 zKW>FC=9XBY+AR7_(#YHHj%wkSYweNNEN0}UJnf{G{V`?oy4+ zy9?(r>Et|e*T z9_S`X`}TezsUpNpvT%3gA8gfM3$qH<_sad(_a+UD5!-ldu%x})m0*%90ZR>=;?9dD*)v&;x4;KEuIv{q*(#lhYURPM|casCw!##yj z1D}m+EN2*Au!2sDk|M*Wo+HbNrxwXCd#Tay<4hV}c7xUkoJWM6G+6{mg5R#n$(9?w zMA6I0iToIs69wtSQ61zaV}qsNas`(Fsby4o*l^x?-Yb7+)2Bxz=6UWK91jcvTSbu8*35i0 zEGOGPA)U<6UCQ^>AXCh)qJ79PQE4;U1kn?NO6&;xhLpv6wI`F;(L+B5&+-e5cWtkk zWN{P%f=|B40%9yX;*`=SxiVrf=UCmFr}GfH?;^S=2m)iyhrStKAay*k&OC>A#t)~s z@4qhP#!J}#2r*($JA|?HJ6T$@dO%{a>}Y9+aRYE#<;^n9iD}$fhxZE4e9pIWoFigLj-1u@cyWc9az2WXF1CR?3RWEz>cD%UDBWBt;1) za9p81lneYJW^BY|YW&0wRF2#~vwlY}-N*_DIX>yw!F>oSs5;`#?zk3~A!qMAfZvlqp6kpbYm9g-23-Z&u z=;jwP!818B90?0TY7vLUE)_45*@_4|UOG@7@el5|G-`nX`W78MI$QhR04PXtw7M9!*SXUXjbhse-?6D;5js}JBawO z-@~?9^ew3Xch<{e%QOpr3kwotlZP4dS|9;|5$8R`_-@3>d_9WS)K8OM(DV~6y5f*I z(Jh}LpA8RO;uvBOM?E{ypIN6N{N}>!0H|k)OrDDF$hS;;9EglxA(?faRg+u1?y)}0 zq6K#jfBB#NK)UXLI4P3!?S}bCyXU_AFvxmGAO%O^KSQGvn__YfV4p!vIcI@Ou6yX6 z=>?G79Q`VZc-I>S&LW>FDPas)yCS&8FkRn~5r7DN&Amdon<^45#_?=V+va4NDq-CF z34gNq^=P7W+VDACHRfw8YX#f}6%e?}lQ04(vlgAtaSz-g+D9H5@l&20SO$mw;tK<1 z=BiXsGZ>Y`TsKcIIOiu%{~xG86!Vtcv!wOgo^ABD+ELB*bp@gDpoWvnu2BV)L4HC% zO5hcmW-SHgLM+v>t~}z;Oqa(3oPzkf{wngrwj(amDN{ipXMFuB1-Oz(ESPum;RkBY z`J z^gpH4mKLt$_N;rlLm{$$065`vh}(mrKn=>4roX@M=YbQqVKzgxK%miB*dHtP&sp*c zVD!&ZiGXPEkCpGCzlld7nhcuU{S%s9gSdP3Na0+M%Le|9Cuz$)iVBsq2t%Un$cK%8 z0SgBz4QWdovu5cdMT4-Km1PbQjF}q+vw8O_GQ~t!hi#Y3LbmZijp3B1)KTs8Y)I`qz#Vs7VZMQ7Q{eB5>H39^}M*yTrBp2mM;-y(j z2YGy@J*{1hrjPb4qNa57v-M^TDAfZMBR6e2N950qSZFx1jjHKr+E zUY$1)n5r^uK%uT>S0Y)4_%}1jDK@<-=iR0O0GY^$=dt?pw0!eXM~{32Bmq^vqmMBG ziYmlj`KZ>PIvizvwn$Fzd|#?V{bD>?oiPS=#aUdB;KQu#E~vwtB4rdLoq8DY?c?DB zH)V6(q3B^V@}rA)+EWoEkYd!i)Tx|2qAMk(DOHgqx|!$RO*Km)+xztCAn9=r?y7Z* zv@!5i?~a=6Kmq7*7?d+1S(UM-FTS)-`Em`NBDHJg>eDq8q&XYnhv9QF2CSIqj zzioXPj>Y|VSR8Ln6t)gZaP(w>Mc5LDlhZ6dtnn`utWT(__WqMzDGnj~x^3y@y`MX{ z+vI^h*0+97pQZO!TdyqGY5kJ3lt`^^k*wk7`*kPmziw|kXgD9JWTPD;LVcS@ z?+%KU6=Br*Df5hv-DBT(uVSeml+vk>Cq0^od7S|>|F0a*Fo#c*3_(Y32603tr=ftSoG#WKX_Wp zqI?dQpd4v4NBKbl{Z^q`dlZaKfGRZNl|z==Bj9ggQR6G)v?Fht>`B!!&_VVAFH6c3 zMrc$&cl0|f5P2KQx~dI`d)uT{W7$q)fv@UO2^9pH(&rRTv-A*7Emi~D#E~X_ybX4g z+uHMgFs-imF0mQMdVq?%T;#LOZqtTgnh3QuQw^Z4012_4P*n}QflXD?!fp-L1sy?I;Uh2&AQ|ACHe`bKJHt4DsVz?DI6ANO5X)2H*|u(# z2f}$NQGprRdu|2s|Cy5yG(01dYP5@WiiINd{1e9I?FVd=_@f|S#_Jfsxl3h+8F&Zi zqt0_`)WYxTPaq&jSXz9pQnlH0XSo~M<(1%}L6r|JFKwH&SwG{Ya&bo3lFFVcWOT^$ z!-_7!+!59~0 zqhaj&FBM2zx=6+FqKD`su4C6=Q9+8z?2Ncx!1@B(tZ7=~2^tEMlBYm1g?u>}t0z+q 
zG0Aaf2c13z_COV0JZbMpZ&DAg(WM%`2sDEiXNtFOpy4^@FPp z0aQgxwK`0)68+knFsJ^sbi^V=?@2g~qwNV$<2Z>&+p|uagou*5i@1mIgvk>IW9TX0 zVK=?__c-i&F_7bIzppGsq$tw%?b#Z2!m0+Ay}1F>k~-F4nX?#Z!KJ_dRz-pc+HAMj z&F&y*yu$3%1`A==F>yzXnI5^2r1|%QR(L&AwgVH+;p(mlb^4Zne633pCef$06-rwa zyj4!xHh4D^*RjKH+AC(249f0zDLmg^r;nusm*uDRH5QWI#p*>|PA=OP#lIt^;k#vD zBx=D5AovzhyLz0~Y&LVbfGxOG^$l&h8 zoq8u^VgN6Rqyk|7Flb?nK#VH`ax6wmtuwH3!pK12b~!))h?|0?R4WJDD*LR ztkxuTZr)p7?pdoHo*F9#618$~EXc*jgfdO>uJTM^BB5^8Y%^s4xt78{RUUgxps+{E zzTwfq?_rfu4aZCWw&=Dux*?VEYspGb$xr~Po@y*Z)`ZpmZ(7#Z#85M?FJ6od`}Vt2 zD@9>elu?3Qvi-9f@iDTrMB1!f)jjQ#Vozd=^BF^Hs?}KKgGPWBsWe!os_n#gIts!k zzrg`x5M|z=%J44QXoMgTz_-S0kg&zN@V-@;#*VTWP%PIcTL4O#!^1Ao3_>doyaCp%jT=}gb2NqFoMufnh#7x(vCwwhnc@i5 z&^)Rhes_SZW$gHuqD)-5=4F392%IV2`y#E?Ld3sLAbCJA-d(jqzdvEPuWmxZSWWm< zxjbC|*#?EJlg8xLMx988v23pGaPDH}2ycQ0WWEj;7Q84(LJq-(^zFo33J72!KbPK4 zOhJI;rMk;5OB&a!M(J!9d)Fw?$#Tr<(kmg90=YE7dA$r8Y_&aAGS zQ;R4`8Y09*_ZV>l(!ogu;WYbt*FNYv-$MnOTjVmGO9EF!nJ&6kQ8?M`y47F7&jXb7 zGe+@7AeAEX;wrU5SCr??XiB+xBkp-cu#f{5r52rp5e2Qg^J4$Y*RQ=G$5zo4`Qx|sJ?p&92uL?f?%Ilut>)zWMcH<$a!w9F4E=6x;;+t9k&4!3OMj4C4lra6VMb7M5dFWV%z;s zsol|x!#@dTy0#R2O+n_|a|#Pg{v-;ahSR=&>Xm zu0qTlIJ=bqZd$YMp>CZQ1cT+AY7oz{H{3F}LiO+=p)0ESyn=e_8qScgW>nI{k`rye zV+Bd9hp;L+3ahbtohx?S=2T?di@!L!5@Il9x!2$5pz>_ST-xRshFE+>Ug<20C^svY zY%%Ib>=bYKx}d|M4zmi%*9Ks%pnB*5;r!_d`c4K_9a-qoomO|xqoOYAq_i&44UONi zjaO!Tu_r*~>U~hUB5(GO_%nyV;+e89mAhg)sl-sKY`U8+6tghh*%nr6X)=ase+WWymgjXFo7ZvU>f%Apa^P0e>vYxk!R)w1fw{TmvAaAK)sHL-{tB$IaryxfhFL!eAP+peI`*RO4FGx1;H|r`b zV&D=4#d#gS^_=c)kcQ~ro1PPw`q^yVaXo_6F=^QKL`}&BRe5C0YfNHUyp=xfVz}1t zbR~7^yh18<_C}MuRZcbPlgYAUUsR`BZY5T(zKPJr&h{YKZWPusxFRRIxBrNv|_!$@FmNnXJUbT(%goHIz8k2%4XS9*^)fsY{I;F#3{eL zIGE+jx=`wuv($h#2Z_(c;=NMGINKXp0juDBg&j(#7UL^Ajw72hnxGllorp3J`QU33 zYCJ?luyy(rx+12+whnj+&o4Ao$GLH)y~Rxjz=Er5`dgye=!n!qgJLhVU(Gfq@1B1) z4=d9-yg#%Sl`1mLA?%8($gG_G5i3CWeJeh0LQF=jL?J`l(Fum8x-ge4|?THx~Gd%-8om~=jGE*(~R+Gqw6?$~CB@HJ+pn{%TDV|Ae+$(xj^D(RIubb(+(U{l5BrXC=~cm#u}h)laEd;L2SwnkzF)PGKql;CK}O^PN2NMwa=QY!=~nNo-UKfl zQ&c)>d$Upz9plsTGQ?Yf7)4mpwEwb%7Udv(Prt`_i1i_9^GHyxg(u+*c|xeVW^lXa z4|dpJ!Y=C$q)}$?Bi0c2#U*5M90ePyLepbLrmW=p=1CjJo`kzWXzXZ{QiiQDK*1>g zCk*V3Ni9NsS|Iu7Bw9eDwD;2WeowWPk=*I+QBEfn{zUttKW0qnSdt!x<2H z&NWwo0G~_E37khUG|Qpksm_>g$Og z|4KNnSVK7?!T{#4CQ-DYDtpWSy8Li4w2F8uM~VaHeM{8RRF|32&r};m*&^p7JR()$ z5oD+vTRPz5x{fXPk^3G_q+wD|bh?ln1 zhLz6cq#Io&ZYiKo9@q8|Smkk&U$gf0=WtD}x}`^Tia`t+PfYWt9fcrrM6?CnKc4;Q zOUY`90-2hKpyT#>-z3+n?M05C4FRpSSi4$2lu3FEu#{h6%+xJIDrFt~U2ax;@Ve1H zkYq@=lL2szX`jMtCyI3t2hEtKXb=aKsNA#|t#Bs%1-i8U93cwUc((1)SL9+iZ-p$B zLmzKjN%)$F1x!e;3XAcbjHD#CNnn=xfC+|Z==FluC>bo~k0H_1@pcL2n>TcdJHg2>*LC*|{QCC-|3L@=a z(wQDwy8OwWfeQ5lzzv(#><1vinmlpY`Ygw)YyR%Bd>^!#I6fC@N_x)SP$K0&2Dq`d zY*GY>wL_2Z_QP}U*<1-o$l7VAwYFxTVXUi`{bFsy(z?1qJ=EW_hxIBwS}2>9*_83~ zjwnQ`+zhKdE1ChO^k+{x{{6~P*%n)%nur-5(m%D=%7C$`bZ}M2u2H;Y}nVG6{Qx=5YB9j z6&m&_EZ|xh$FWBi;DMm|aM5*0Y~e@IQ>O_q9>pcGQ5TcxW3W34I4Y~3F3oYvMX;U=n3betR}KrXQ;%_rK9Y#?4zN61Y#@! 
z=3Sl~O>X`V08>D$zh<2<7t}!fIGrPfbR|Kg@YST{X7-&HPgdU(Eo?-!zKKGBg0w(dmok1Xqsl9#V!!@y3gYqACoj=aATAm3kbD-YrVRdo_7rHbvyNs(X?WQ zq-(&3QWlGx#wf1>ODX5y7vmqx#N~)7bXyTO=+Z5d%B_+|#^h*?itkJ%+kl5&vJi41 zG!H++$rQN+q}?dgu2K;?#2iyj+KuoVzr{v9uJdYqh(39`_RoFfW4NSl+9x;^gj|+z zGZlhCxSB|CT(TB~eQpGXPDCAfq#=X$Hx#qv3D#VP7jt+QU}^Y?J{8}3{!EOHvV3P{ z&=d;CZ+5Ow(@!ad;;*`mm7Gh36ms1`$LDv|L9O%=Ovp`|OM-A1ZfrHr=@pEXs|W;T z@|HjLi}Coy!js~c5>4?m7X^5597JNOhaNR=!ecX%x2%&8nR1hJqu_q7L9EV{tRNaU z75H}2&nw|Oh51x%>LlcfXuS;HvIpBzLVl1EAr*%6d>mXqI;~n(zt&T*to}@hgC+GC zB6}WzijIx~xllplsyx=%Gix(s*_Coo0=Iv)HrECHkJ2d>^9Pf?Iu|8@<#Fp_A~X?1 zgxQ;vF3kaj0~&qq#R7kl7M+DJnm`z14@yVW;D90AM6M`> zYs9FYG@nyCBTh0})}DJfpxnPelmYnrHwnG}pq#)M4g{sUf4eI@;VzzUrr@4lR5G1p>$_?^+BR*K-8Qs~ zwsV!hy_UbgdS4UhN-J>s=-rO^Hy$)y9Zfl_vHjDR5Ttc122l~Fy!xS8^ZX&aJsuU1d?)(hUx94jVpEAfiGau> z<6wu#=o`pp2v&nUa$!)Qu-s(2fn(5!e!Nx94WX#tl*9tNq*6~+b85p}cxaJrjPq>M zkdPwg(}rXklii+(`7d%PQycBigvL7mr>q)LZ}Y&XN@d$dfE*=l=FC>fHjmHTn$O0p zWbc?sncMfcu7+`HlI!A|I5q`NfKay&m3i;Z6NPef z>HvYMo9eOTuM3vM2DWsRg`J+4u*<$SH2dCibjB%YbB7QszF@^caM}9PyPr;z%qBzR z@dhzJaJZ}-0KWdQP(E5ebtnMFC_apoXM`(Q*isAc1fDuHEko5mo;a6!>Dq0>tFpCk z%jQbB^Y!ONt)%pVCbH_QM^$+#j1TtkmL$CLL+$o$;2%#{*jzB%`nwLkmTY$V25;(I z_bQ#fp$WTQW_hVDIyPSqO z-tZb#htUT}-S6%(bpObLdA6OU|AfnNu8j>gNUU-TzS8a-{eG3EX!*X(`J6j-5axhqc=mJzyK`=&Qb?@d$$E}@ zlV;YO9>U}5S*^??4H2FH^~o@{T8*t&F>KZ@ZIi2=M{d8N(?cWS&$JIgTyGfWyPDF~ z{3yLLgS9jAbdgFm!|)VTC_Yl?L9rByM2&_j?MtKb#A{A8)8C}Vtc5y=V-qWh=HP6A zEtm7Z?f`tu~*Ra;3IW|d!M zslPnkBtMXyRSRUfgr?PS-zD00q2?(irP zySr&(0-xr%;?6!i6BYQnsTtr8SUmb#JgO&DHfM#=9V)~5j{2^2%F6`%VGT*>Fb<4D z-{9T%aROAb11;j`*Pm#Noh7qyotbuIb38iRzhEYj=_(qiSODRHWC6*kVNU^ATdX6u z?6YCobWsL=Jb>R50)bA#ZZ+F}4^_Gqley|Ek;IKcMV5BxtOVie@9tLe$TouzQq&p< zUeKZ$zP4Gz0D2h)M0^O(!d|fsM+R(Z2LG#y8(NDr058w{5jKmM6f4pht`18aLt00A zrO68ac07}aN)`wzSy|$dzU>Mw#;QsOcFA+eVWOy8EgsxD!5^#QVwD|wJ7BttO6`D1 zSPwq=T(Pl!_-rLqFS`a%_#eB^nGQ0)kVq2~P&__XCg0U^AW|(Hp{~U#8#r$1I{4|SBSmYgOz0MRjHphKYs@|kl_J|*irI6y@ zIg~%`YKl@h@E*E^r)VumzbP!vR>mqvWwg&27QO|T! 
z^h&e#ahq4)tO&S3mwqg=8bnm55RsuMe(3OOh}OT9rF#H;`ki3%DECUdw;zfEikh-S zrPH}n;^a9EHO0y*l(%IgjXLU?UUa6`A=Pi`uFs;o>~1>5aHbI5Bk3|y8J^|H=+=xxX>S3;5b zt5>4aY8BuzYq4c^umN?UnU%l1hl9#gYa5G)SO3a-vQ?K@#~4TZisHY`Q(-Lk0%Ux3 zN#6f7mG;1F3TU_@p{ddC_z!8eFr4LsX-w@iG>`MzKvII~mw+^V)`~!HxyCww*z|lL zT`4;4#T!#*{|fnJ{d2K!`X~`Cw+%Kp)h7RmpXWD1lJMe>-GTgH*c%pCk=0>W?dw{^ z*luarPb-}VQNp~9wATJN^ayEM7Q~-MOL095qR_kv)msol}eJ&WX6$Q zc5uBvo|8|9$JdXcHf#8K*lq7y{*^~IqgTw`3~RhAUJUHh%%Igle2~=lJkv|&(HYE4 z`AB8yW4J!llvoLG0I0oSA+8h!P>@N;o}<|!PSa6`2bsu#{&u1*%C|_hRMio?32jN< z8%k?2jawo-$@)1=BH5_`!BzQGa3)^Y_>t8r6RetvVBbjs(Cj4Op$6$5VM$_^qLhBX zLjVlfx;`bj=m7T6z7{G4!J~QX*(E3^vRG8d4w<-hpk~BtB*xMLL_ljD-~hnm=B7Y8 zzuhd*Uqnsprpc1JX)?EGzlw$FF%-xx_?B@9os3gKFd%?&>muqfTEcEuzH*54w%zdS zB><=RHCx9ifzk}uX7pR!P)n|Jo=7qkO>|$S_ElfZ@rAYzWUA0#3xjT7Oa8PRPp;L5 zImIPUqb;Urxv>|!9-v~3?25Y;eisXw*a#!5s9qErKOa)a+J=ujlN6I`l~V#Q6uAZh z$}oYOA+s3TWBBUO(jS*nM8hf0H>r`OgQDXv?sUdLfh;_E1#w=Qg9EaQe6yxu8c+od z1UHA_c>&c`Fa$w-nKs7as>9Fywv!Vr$jL|qjQ!2bZflJ*)Px$^mH|G1_mM27aLf&X ze{RUY`}^z(4i>Jk!LBaaQPVmJR)v?2D|Zm{HL!I;XL-NsJ9>!p`i!NNHPo%O?4dRb z>tk;W!~eMdJ-EPUhr!UCLzoYtMMV{x_dbKq@xQbmGP>)-KM!GdDbk+ z68%0kS~WtR%&fx5e&wUC4(n44KV{G7@jPtFi}C-qu>QLuNQTeTZ_5RTaLD!1s*LJ* zs*C481SsX`XYNlsCj1h)*HVwqukM61C%ozDCng@zM{!q8vOjwEP+F{ynzR)iPIh2t z%V_YYWDPQs{-e5dE0?J@uC^E9SgPu1cqlKSG9!YYasR4wC=6l=I35hTHHS3^pnZV@?R9NSc|$1OZ3_WchTeZj|D0Al^nvWxPN)v z-AEm=Z7FDazX{g+n06a~CZWSxW<1fHcFV)QN5)ZsdkIc}i#*L*Ebc@O{28i^beP@wsoH7Jp?&}1jEOW5m<+sMk8g; z{nszd$YxJ;G-_l*rH#atrBnX{)$H*%QWNSafzAKD!^p0<;mbuXYp_I(4ybkbFyLdU z!|5mx@<7SlIk2ZFxWmaNju{#mIR0i_So(MUlgUWO1p_RQnjj%#m%XN#dAaxgGVKq% zPq;#ohRT609JlSGtKiPN9&k})YIJvul`%&yQ8k)|qY%v(6tRljn zy*2>H-Ws2ZNO>$=q1YCrvhv;g_jC5L{zM-j>KAeB7`pC9da-OsR#`xBcT2*I$wWR1 zEqnwL8r!|=A0$7Dk56It*|FU5eM}5(9()b<41r?drluoig^HeJ-Wd{V)lT&x*ksb@ zILVDYl?TW@t()esXaK@8dc)pRVs1H2To?qBRtLcRm|O;|Zeg28kGI!VUeW${Au3QQ z>Kqc(6-LDaJdxU1_Yw9SjZC1SPQZZ#>Qa0IVP)~XVB}0xGzBkpvW1J8!i0l2w<;Tt zJ*^_;$$Jx1S}4Lk0zd*qV3D7`g5X_t)qE-v*r=i%YxA3utE19hKA~o6DR-jzy8bv7 zIp|?H!oMH2_4%%&X2w`YKa(K*h!+GVRZ<>H2<_KuJlDowF+oVIX-&`sv#be7tuTap zNbRhLt?v*6LYt!QmitZAB(&c;cxw+j^|vBC-=MA^F)lk!^Na{G!c`vDTiUVbF28B` zcoJ!r@bj?#z(q-dv_iB&XEWnkb7QA(sao$a5_EGzTkFMCTz};%*2gZEO;8X!IC{~2 ztxmO1VurFX4d&QV_teeomN)LWeIfG7Q#8!UX7)4j?sSc>Bzdx`^d27;#e8p~r75w3 zh{?g}Vd;7kwFw@;9YW^Q(Xu4hjHBbj8^?yD1TBs;9%MOJ5sKrV7oPN~kthfVyNp%K zQ{Wo5zqmQ0cc8R8?HpBe1V(sE07YNa@*iu{L^HpVNsDSKYRShlKQoY-KS^l8+u>+S zm=HOY$9?nF;=l;ELN~plxvVC6C#Ld$p3dobeU1Z6&ILe2kN#s%EBu-pF;@_)9+tw|O)X9w*#)E~o(g-5 z?z6Y%jzLH3!eT3{$w!^O<;V8&0|mg8j?v-UHbj7*h4lj)!D=N6wezk4O=gQXe z?B0zeIPM_s+MUzEbpyTnmi)8Aq=TI7gC_BA*a1+jmg>Vc(7u;0-oTjB=R?n;SZI_C z`9sjR*0QGx(8*={2OEbg(e=Ln6pNqK&qY~HEysA;S0dAyMvjM=Ri>@J3nnTzA0e9N zn4j+B4mXG1>7|T@9V&?pVsvcNl6^ z9<1@@cL%2tK2!WfXTIx!L*At_o~|f!^w!v>n6=Nw;=L}%R*eWJnpobrifVv@p&7sO zX^dh!cUT;u9(>M0{UIISj#9AIXQH>H)(lCx^EZ(#nBdxnWpO?Yr$+%ZeLwD@vo-280jv_OY4%UhH`~?qUz!s?v5`sDZt_X;^=E{T;@Ji%yC+vlgh(~t6@C~7ZM3CTU z1{UwP{9q;Y2KZbh9}IXZ?6u8;A3nq?@~L1syf zsL9*`iYC^^bTyglkI(F%fteu#Wv#1aHt!q8R`wSlFO%BOkPZ{So0LUeXQMmYiWfzF zPA3u)3Tncm=l2ovh>Lx>ED=FNrEPyYy?s5yGXIOr+X#?Xm z(8+=yWre}KTosh`qQy!p$}Zm3a}4AM2{jEC|A8cAi~d1&^a9K7e;t#ee%6cd>dXpY*k_fif zFvbLcxHsk-4lmn_SQfb2pa)$v8PjeX59=UM@ayjslz?;i&b{|h?u(-gZOpmj;Wu#^1e;jC-ph z>#595o$0~FYAq(i70}qsz-neZVp|#!E8xABd4P}xY1h=OGWt6<6yP!3sjm1StpiSec2${{hp@b|M;Nmj@>o#2RZjAk!4yj}HhYKkqXo)wY|#DHzx{Tu z4Ztjk42qlbfkw91vbJQe1vk~3HsU@F3d9y*h5a(fb>`+uM%1MxaDu=e*5G|8hk+_W zQbbFzb?iGlx>cARKImOwB6~+P9{+5{gAOT$IZy|sA@oc680bh~1)U`)=6YJNY^k&D zR6I?Ti_XuFzewM+pP2PPRuKlt?wuV1h77;pfX*@2S7r^Te8yzORbXCz3jAvZQ3GYz0OebaWiU#{wbNw{sXwpnGI^S{ARQKS^gw-a>Hs+ 
z(3nX_(nQ?gGDylfu^poGX~TeKV*GTq;-er${KdUHo;_qLL#L!s<)i%MJ<{Z$@J3B= zxvyWrFu$DMOKUus;pRd*Q{TfR(%1c_+0$4;r~9IwAPxWtCzD9^-d+$Zg0M<6P@}Q= z&kjnLvOZ!ky-+1yM%!5~`QK3R=&Uf#Xg5LhDdkQ%KEg!aj_sL?ZsB{~(j)Y{8&%*caEj>LkLsY9|HmriDk30?ebsDs zfQ&i8c!2qwjkgJ})Z@(-WUoIb!VWs`!%r?@NZ4mQgInltJ)gqP(E`EW<{92wo{m54 z=E2kPf<^o%pOpril?>43$!e4EG+3qWi$_lYoSl#O2fo!~U9^;$DiU)b-JBX2a|#Kr z9TsM}zi4wyFKKBG?RRzRzvTQULlUdFM9l#V+Dp#B5S3qyLdL4m`M}`VP}NhbP{8(k zfP$`Jcvn#&kKLHh(HPH29Ebs|K7rom(Wj?K-^HJn%j55E&8p~ziR z2^lK<+M=%M0$x4qsRkKTWhaok!BS+~M0Ewnu(C&rJ8UZq|Bxy&QzC@rkYmylg8X?w zQW|v#BN6xaF%@cjcPzDa4U22lujqxUl{mcXdYI175$yte)^MIok*ioU5mC*0XHH00 z9aZY&^>yf*CpVfbH~pDsRhI1k4t4I!C<*ICg% zIjCmAR>6(Wdb8HIE>^sV3{k1(&_d$4O(QT77x}$U>fAkOzr8Y{S_y8ef$xo$52T2J zE`kE%ckRkvtK^@*zUY8CWv^ty6V9K;y5tq(WG+M8GJM(>-?Aks;f%=XA*y6=7l;hx zLt{3`4Ns)JxHlSTidP9`aGjq{)G=2+y|&bv)%6N_AxFSS=l8a&UihsleT$VK*u@KsQ9HaSqu5*R@GPTPPDm_Cz3Bq!MpP*j zi<1?zRQ5c-p8xqG(9A&;rC4+&y^O~uf?{Kbo!bIa%u$yNLTx=hL;yT^2#3ND z7nY?XUpF&&{M<T~)=aD4uu z#LBtaca0GYR>s?W(4skoR_^$S+Jv~V4sz^o25wkFrrD>J+q^&ac-uZQ`5G)RpyYIe zO;7u{{04V0k8}*>Sup0$0XbSg|v3g_m7G}%;Iod##I|5`m+Q1drfn(n9b&||CF;Rsb^q$k%?l(m(}G8?BUzz7^khZ8$LIAg55I-14RhT z@!+gf<}7t+N5l7;S}IXA&=K?1u5o{A5oQi$&fp0`p&tRF^MzCQ+&rBJ5K;=floMfI z0f1Y-TcdNBAES8#q=x=$@mW8qC5z$>@A|~yiK^mrx-uK26s}!A>uc>LR0~#K8tBA5 zn#a-J!pKlqV5q~Y9`J!maEnt7iJ9z)7}?s{H28l8G6*uPvgXUtW|{wF1y4SFjBR(Lutrdnu~(-lV;27Y8A{{ zsDp(g6H@0-et*ICo)vNN#QMy<@ zTle(!#X?py$smy3ZA)Y?)0})yn45eYdXXOOGjpn^u$#rqqA*@Shylz=&v9v&R(V!PNHQRfKB1*YmIFMs;Gi>bE?BQd~viA z<@4$cf($sR!#GaX0r(X!H^Rz|q*aEvjkAN78WQk^wAJ-FDwl<2W8U*fjxsPZ?W6ch z_*aRo%l>o|(Df>0>7T~T-~L{Tory??NxsE~5^YB76TlbT`wia5CGQ(r=14w?1bJ6k zi$Hu=yi;X_c;zZ{@>0JgTJYQALMnx&Jwg@!)X`KMsQy2NJ1#lF6c0vmpkoSig)-ly z)iE&+vDU3s;hAM!C|N&RQ8Za^t;AIi-y}|Ywdklk_CoU=;0y=ip{q|np(`FXYn=w= z&lTZbvQWx3cPlS>6-=t>3<&;Kcb;OQ?kKoXM+FM8b+9qM=YtXiFt&F#*U%esj?pB> zFth@+I>OX4JAxOhzv|?u;q(E0j@@1Sr4Q+(<|GC{XhPEAgt%G-5!#9@{!HY-7()Tv z000V6K>`V)M#S%7a3L+~EALcDRNBjYve83mZ?TPURj4-=M>EZ>JJFD&$q5KI{R}V7 zjHHs@fG)~ATfI6-eLt%&s!d^T;1KwIW0pDBzOk)^T|?6G1*b)9y5=yv$(L%Xci#`R zOI#Mm1!hz&LwIb&6IgxVL^&0V=m%Fk;on=ZgWh3&P>TA#3m2Omep!^5h}IW@-p7Qo zu^O0L|AhT31`_n@^R(J}w;n4ebQ`u=&~OYqk7q=yoxzib7I@mMdf@rV>!8SA(R3>c zGTL1J6)2K6TLnybN-R%x1cY6Tx`C@wpu)~#MBcJZfLoN2TGCz8s`9Tc@m3e@kti8m z;8kAZ;jXLM$zn|_WyOL{F?}3nTmg3hY!jZ|dk71+eVL6}9et!#xi(%c@3dOn8>`mD z_J!!dTS$I_)K%PnO_yjHRQ=DYMw7{BwuxG`6cYvFXk5O-K`a0^l^}hZ$5H`h{##sL zsR?7=3$Wgjx2G*$w;V#oE`RaK6agU=gwBGR%SrAzpl>X8p@Qgna@i1s2OQg>QSfnzpc(M65==Y9+6lFLh!!z+?7XiG6I04jt!Ab*GpjhH zhn}?RutbkojQvrFlLdfEfJkujTd`->rDV1SpA!%FmZW-H2Io7dtkFXB%jiBXNPEx1hP5*aFP|?|t%75TtIz!OvH%VvBNWoE+qKj>|@+mv36x!WrP8(vg@@s|}+-o9Bq;qr$8bO*7ghG_GG?vspxF z?*9cInjtFMQr59d8DchxY|(dbh+1M#G29yYry`EBvaL@M{AvnG;Y9}!lGxD$M8|Y$Oy09z zpeU`*Q-gRMZ*to^VbWbZ;Kq^^Fd@DSGTM^a_U7D1k&cG!x8c4o9RdO?iaQO-P3yEF z^uY3u3&wK&{f0o@(QTZ_6I3>`ZX8Keg#eJ-aW8y#0{}VvM&67N}UBEbWu>2S8N7;iQ$mTKp6vQ8DICHvoIF-UZhIM zrSaoJw3e;*Myl3IVPm!9y26va<$bYLRWkK*L~@+;zTT0wg{k)*mTgtuHtf6OgW75A z5FTIDg(-x=-VHGfF|TL@8f5BnF}R_M$e@M5p6teYkfj|e3J@4&+GhgIH1fFdpD{Sq zKk^-j(J@7;;Dhi35K29%w21efg@Z-GILU;>pNUm$u%ZOVfesVvQzuN|17kNr$nKiqoo7BoA+z++`V4j zX9cBttu}zYSvs{b&_Ackn^dB9U9awx(?M`EEy+lF2p@JfgyW>uSh}t=WmS zRXt9adE6HKjEZ>Fx6^TZenAL6DLmrG4Wx{v_a@PwO^AV>fB`5pn3@wQh0ur1btS#(Lr{fQOq;nZaZ- zhtL$!hgfP;(0dOTRvkPx0NXs*ucZ&Nh0~z?*6-~Qh`ct2I)1)$idXGZ4YjMWZf6+h z{<`hak;0h9L)C+DMJ@<3Ty`tEu}&-yIY<)8dWF(_VPOl$+?^Itjoz5>X3(s1H6^P# zQa-wMk*>{ibwTx8@y?L0?$tT+3g>LKI&N>mp#oS>J>2*TNG4LgoyhU(F(qF&_eBQAbU-IRjZA?3{iA|!`rY->eksv!CHi>2e4BBge?-N z;87zYI*!J;sU8U&tc5O*bfLMbk}NX9-dD&X?Y3eE>4^XT7Q=OYhB^1o^LAyNAiYx~ 
z{z#!BPo{d5$e|`yQUhpPMO87T9jZ@rE9YoQA*(LKcnlNOBZu+qkxl zrtCpX=Vr#16I-c|dgvr&5=>-euwVDF9~7>Ow)D3uC>SMad2Vut^{xP(YwG z+IY+m@?jG<99I1>p3i(T`OyNHHM4|kT4?kPG{CAZw;L1StWG6(i@>^UZ!SMGUy>jS z;6j^LL&5$s-Hn2QxRwcLGb3B*>>^&lkz{Pp08L-FZfy(aaGH{;qMzwJa2k3vFZib` zpy6s<^ifiQ(#d2|ZP*(7zYh9L0D}BuzxXJo^%(;6N$qWzRbeAVkX%vi9b5olA-%XH z(F+E2(5$p5T4p`-Fhl7zOX|ZA$G?Y3le(%O5T{SERHU+aQqY{)mVx{`uQ_vlIvqRL zeRHDPR+gK(y;5Knf0*jtq{%BX$R`h>`^x>)`1cqTBp&MK$xKMa6TVzWB!AcxiaHuY z#Z-1rD&KwWS@~bph#0x4MMGKEGw?n{kEjoj-&W8wFLP?}SMNGgdiB`^&N0ymALgpx zOCtimG}ZU=e(BYu!tS$D3rO_UK22juzn0y6gFx3*!}6tHcsRkza^Y2qrT_p3xIqB| zXpyly|6u=K(av;BipeLZ#&G%AUHR1`yUg2fqyz=c-{de?cxZ{=1jGZ{$iSO z%#9C_ukykw__~xW;ZhSbYK*^-X zANFn~orjo?&QU7K8a0K@kVrcUc3PkHqAz`?JAtD<)e;tRp9|V3xiVF(Zq0nPT+39O z0BV$#tyjG2tkzpXk3M{d&w(DxKE7~wH)nsP%jQz?N@Pd9r?w}9P={}v&M=d^%tA4k zYP&Aj$^B7{jVwomJXz>y`G8jQo3)=>?xfZRW9&@CJ(#{kkDxg`H>)xoWlG050oqlj zn0k0w)Kb3x)r3Mf^S^?H%?aTo0P>o14w%)6=5z$O7?@9W??&Clxu4iu&3 zgluQgAkM?E@V?B6ahMt`r>C2{2<2OA-sg$*-36;U553HYuoUD))z4eG7x`}#Bus)*&Sz>_ zT~t$&gC{=$W~An~T%@>K(7f(v^ZgoJx*p;;WJWR(VM zbX&@D5X|tZTFJ=Fe^3^h0D<*qK#AI`3tH)o=dzhQx8`w6gL?UW)>~5}Hs5k_PqhOh zkmc;c++Psv2`9s+`^Fxw3i9&kQfQ7Pb5E9`z+jBtw;lJe92*6)Syb3*5RnX9rM?fP z^C{s2FXnByaYO3=>xIOpqAa~+1-hvx4b{N>uh5Q^0KjV(lfE9s*2dwZ? z6c|psRUFyFMq^$&*FUS>X1GG+#xjgOE40P$9EkNG4NG^*LvDa%RwRZYa2@tV@EiVA zK-$Bc5mI2hF3?h4;1woYuuu1^_r4{3P`LkeJ`3XA?nTgm>qqME^U|ETWH?z2`mdKi zs431v%NJlIG;}Q5s^KN!`}%UOOZ26t&QC_jp}>?GfU6aeF6eE(R>*#<(TF~$iY^?d zdLLV{oCLEZb1I}%MA}?J^BTC@voksF753X#wrWbOct}LOIl2luw)p!le|7JK95|fx z%qJ<>DhRp8cJ+bi5h2`u>aw@|I&OS|{zambVOoqMpko1A%=Z2j2bV&#%$?m4YpW4p zWLG|};vA8px~TQlA^F1<5~~kk0XA2EcSBWtBL{ctCrTw_cU;OOV$O;rkDhPx^WTDr z1eE&?($uoBg+|ktL|;Wn1iSk`T+;b>!dyGsFmJfOU`NB`CCx{<=t zFw6aV3D$_KLFql}aIo04pwaZ0RjLtWuu88=a%!Qi7z}ev6BWLY3RsQdmMC&Yu>Alo z!uL@Kg+jscZ8sYZx79e{p4N3@J)^map73D-MVUtQEUjP@xJxMIMIhy0Z@{m)cFJNc zpd+B_l4Yqu(T=ivs+wvqjJ!|{Rt8vr9TCZS_J_+>XICLcAJHcfyH`GgHZ?g4V~*nK z$ERR6rD@W@0^C2UyM?gLMOeZikHq{x%BgY9FBQ(=scX}y0oou^?;zsYS!aI6FaQ7t zcR>MyXpylN|408$Zatop+viUo(g4gFRR&uz)&!5U)%>}j!q{?k4p%dWW?Dr~6pSUg z2a}JQzVb79@FwYnoeRF=fg%ZoPAtny{QQj|x{WEw5)4rr%rnu*;H%6h4H>ujD9HA`wU9s1Rl34>7KE zeP}EILErTWUN{-6!^WKoZK_pjdnzuWdHWQPB>sZ@y2{ar!;)zXVJ(?(c6VnE7KIn; z9Er7izX@U|O96bBJWJaaY5NmB>Uy5k3==8}_Rc3&(5272y@tOH5q+@g#)x0@v~a-y zu~qUEt?fal@Ac}q=&@1=7dHui0i;!i;6GLd<##F4yIllX7&R1#`iG2KK)DY0Q;y$h z8@Y8H?t(44GFMLm3XkqyG$&Lva^G<0ztZU|i6I?%V_7eZuH?QkvL*@Ylwu%xMG-F|0bxWbLTXU1g%uS4+bhrnFMu2bR92_y^ zrh|wIrGDt<*0nw+-)!4Lxnx%rj4<0=S-*3Q1tb{x_Pa_$3CtS=v1N;PPMm1cvA<+) z8Jp(;f!5b=(?axXF3%MBY;iXgt|vs08EyVmXXaWX^up${bZEB?8*2HZ#gpV)Xh4~^ z^s0Pd-23(Axy$C<=md5HTBm-Yw=yTOOI(bx|CjOnbcxfTi?ScDeP1f)amw^|OlDi! z(d5ja0;tS2o?(C9igP_}sJW8yvU&#Puy2=ms1|Jwa) zN9RI4B7#%;AAM^W0N9ueV_FPQxgN+y8J2eqKTbim-XIBycC0QYs)e3RZOxRfF`@Uj zbYF0bc3HpAsI?W-)&hMGgSFK0kRSs*_rkAXpyk)Uupi*l| z#kaw~Tmt8B53`YT5TS#Sa{f8(dJoC;AJRsss>N_IVIQKD015^VVzc;BLViZ6aPAlc z(#9*EXGZ4NzSgt+0Q1WhVamJmh{(l9Zzl1(n-sZIqmwK`@nK-jSx1+2uhBT~n z_Mzc8YdPdPj|>>@nMZvJ_TmQhymF3B>PPGAPxdEivs!meFt4xThx+OZHG>Is+l8n& zZ9uRL08n&sZLF$l@|O;i)O>qI5@xJe%$3_J{*aLDYDd=Y3X4j_%NU4AIADQf{%9G( zG!rTXyHNKB5u$qDGHOFI5YRuV&@y3+Lf3aKRJ{9RoWy?*8%3kALN-|KS1;F>W1}!= zR2ojB@HukY?39R6Ux7FjZM8G)3U{D#v)wA7KJBjj5gNWhX#Jj zm{*RHIkbyDXV%E7o!)r02r3Vnray3?JZaoGz7QJUoz9r$m22D6yhU`0rV!Rl%Bi9n z&VpgnaZvS5fxS{JqqQ4Y)G^^!lYZEXJ`*|A9kTC!qekaIaVDs*mdY4+5ysCN}qJan!;jmwhMrf~7(1W@XzY zn86B$`5>V5LiVv#_gO6JA&P9@W! 
zqWwfRVoNr!!iIw`Ur(IZeoYJ^m)SwPnFcGJKLkvnXblVieyhdU0009RL7qZ*mH+b> znCc$I%#kf%9JBvkTE5??9zOU3=Lh9waa2g<@02OR`E(!mx>>~O#vjGOP z$J=*Mj%$scHkCBy@wHcSL;NgT_YYNS%N&@*zMAb?CgIiG0@llnE z4wQ8rVFPe~(MhtRr78CFU3xRC-I$j9uG8ynbNMhy-ZrDG2qDF<=tPgO>dc}bb3GYX zl~Gum$U0gu@dU$kKTa7BCQH*-O#XsQ=zuPrv3un>A*GEcSvT=8$63;u*#+|@ySdck z&iJoF0xFD6pToa&MRTUt(VU{&S(TbV`RMrIYj&{`JPHZETSHFQamO3vUvb=4_2NE} zt-FBJub>x~Fu{448AIC<=FLU`jf!o)k=Vj;htO2{ndB;4_eVV)`WnCy5`KAO{^{ChtF~ve%idnHdCVKKF-(%zDoytO-Xb(#jO~(@?G< zGcvt-MHCiI3W`F6c@p;%WXE?FQ+OISiNF_i-vny7ry_`Nrr*$Ix<)i|9m<>5gL16+vp2jx^Zd}m=THW^&~G{3hi8CE4Ds~m_?xS5 z@%zFpoR$B)o}PK#-^o;f(xlWTYKP^{#KwS?d6IavN`m6$$Dw3P8C040jyqG$HIpnP zGaAoFAw(`u!Z#f207W@f{<$%z13xtjR3K(D1g^zM*K}=Sz zkN7XAQzmXV4aT`Vybyn?tk~^>+0$05qkRy96xz>5cegC}KZSTdq(liqm=qxwhi!5X zcUH#JoZ2*(XvsfMHcQNXb$76URGg-~P09W97oWVlOx`qr<0r+q*8+UoisWxGQdwH( zfdBviTR{N=c?sf2|4~$$R5!yT5}C;Ai0mo)wVx!KJ6j10ar2YldNmWt_-Tm2WG=&h z$rA|MOF9vV5u^ff!FFOFX2P5;1oB4teO5oc2s}m?modGS>?iOfS)?{w0001+K>>n! z3F4oKLcT7fwBMSWG_vB6hDnl+Rn4*jnbOF>*tRAFO%E(i6~>80;8+!5j^Yf_%DZjF zzohWf5u4TqSJ50N3R)K>jhwI_sNs{IJ3PWqBwmGB0EZ&hA~GFZLGE2$t^2QU+v&=6 z8bQ06|15Ps?u!)rzFuFQZ8W2O=%Kk^J$2?VADsQRcW(9~$YRBcaYp8ZggG4dtFUygr+?5vEXhl` z7r(M$7;)_`o{FOB8~}7&Wulkop#^N)e~G%xNV3*T69*sU9}L;<&(0Fpp$ zzs&2tnK%~~7q4W0ef^%R=<$eoB0aU&GQrADzK!Q`?QfGv)>KL}9GShT)p~zYb;?Kz zB|2F`{sH8$!#Ktt4Hr+S^XI$cd z{kY?;dSn)b1hT8EJWfGs;bngKj@d*sN!4!a9sa4yV%ubUP=ge7Z0MQF9LdL!Vu*!> zh9(RnA0Z!u2R|jDQqBf4K(B7|;q)5MNz`gNvPXW|ju5Xk{2uIg8}foBVN9FV$j*It zAA>5F)=Lg*SHZt8hwqy_YQoP-e{CRL+TAr zL;s@?j!lhLkOV(*DC+5ZHXY6n`e|tFx;us&?UsXOD%MN>^5RAcEV&_=eW=2Ptmfyj z%Fg)q)0yJH={dQmoZQJ1?()eh-G9_l7HN&Lb0SuMPj3jMA4Xs`)5Bv#tWEivExPYV zgsl99@~CS39=-uvB(BnrCnHW3+CL!Lr-xO~Tn$iou?q{&1o!+K*u7s)e#Z^dFBv9; z5~YS{1C`hL&r~!WXVfCxTUZ>C=$Pdd*>lgwH9n_AW#e~mj{rSS)tn|NfB*mh6afJO zd4{g8LidF+PA2z)G=& z2GD-@@0X9Dsjs2_BIApjqb6gpWf^2mHgjv4sx zmR))jWlP#5WTUnfhf~DdZb!}Z)7;adz?aT`cSq@w6CO7;j>Yr&pooEtJ|zM8%Lke3 z%|Mq~;mA6AH;FuxRq>J2Ix+4~4VA*0X#voE?_=&@>zZmY_Imn*EZA7ca zK_p5gxe=Yl9i2734n zQ}|!-{n9tyB@EneznO{354xiK3u|hGK)u>0sJnGI{eW-f_LUgnMox3@+$nEgHEuVw z2|Nm`+70Yyg~To|RJC7ROfgs&jZRJ*o_nqs%LZ-n5u&kQm`@n=Hok*k4c}w+0vf}x zBj>t}V>R`ThF^HfSflAr%u)|VrdhGzK+`$88Trp^a(j;?!bZJU{!1|k#IE5-4jNvD zb%wF0ucKXk%9ei-k~|ME^p&f__1$RmbG#KWJ&SzNtE9Uas)$mZ4n|%E|4Nwv001Zf z0RnlAsSyA2kAP@u{^7SrmqN;Flaq=unt4fGc2ts`7}^1zJvB)H003wK0fKprsWboA z?NyrydswsK`A0bvu=*Ua5@scSpyh>-^4=fiJ$q34wy&JrcMvHKmf1BadCiW0{*vjS z75A_hO?(co7Z!$9mF&UkduOOrIgU>w_!o6%>%(g5_Z7nN>VGXu@B}9S00=rknr2C; zL1>vw6!CAIt=oQ}0T0G4e^*rIQat^Y_20EbgOJJbE9`nb>!e)wE3|UaAwsdN9Z}9NEAzVw`DW(S}s){|#a8mi9 zIyv64NTSr4n$st5@2ibd*>VF(2Q*ufDo?xTq!K^cEtCo&>g&vANTPWL zeG*L!9|IPeKNM+lW&k-$eDp6OP&5v)xQ9bFea@}Dl*t>ooDf>WHFnezax!duD;C7p;MEa8Hi+*!yCbG93 zc7${L%^=yr{ccgpO$I6nJ@z|-zrcZRCeUygtr|I}B!(wItLe6=)3Elxa8}GhR?G%g z)%-o&Zcij@VDkeI?%zMyVsU^o$ZIgau{xX*1l|wY`x#Dbx&q>7eps79$v*qd-Mj1E zj5<$w0KjNMxzGAwf+yS{zj7V6PBMaq-J$jnCi^&vm4l2Y9^W7PvbE8`5UfWKBx!#B z{ptkf;m*4~B*INnX42#^)OgZJm8A`vzzhgPVR?WWhS3bxz8Rg(=Xq=lsV-b z!h&~ts$dovsYk~QD*n+)zI8Ae7|m?FhUfihJW%M$4I-FD{P(gSQ^>E0^LPCoR`nkda5=&^>jYXB>&=yq&Je` zFfhz=5;^;mVdufsK^-0_*G4!%ahr_DSE6y$O(5gcLtV_d+4Y2zDtH7;`y6!e$n7d=>;4L4axF-)8B@ynD;+ll4e?2?wB(*ii+=l>w_3HbVZTl>> zh=)#aB@iv2f_efVppnyMi*`#Z-UbHDB>3}qhu3Pe3ef6LWs$bVaas5x5Epq&TH;1z zg}Q_9V#I`WZicIb62;fUtPt*moCSoYlypIWOk7DH$ig7X9~m$pY&80Iy8eyH>B2`R zsM^`N_KQ875~eNF-0)I#B!y6`GzVY?^jRN9gfz`E1#hY305!76T;V86WOq6>($8Nl z7f3mHW8*7mFPXHV0z#21)mI|>T!XT+r2Hi>NLC8HgF6Ry;b-r|^$)J-_QLr*of$iH zORrA{%#;EXzv);li>G$p*nNW83D|A>s#YFxn-^M@^>b>l!FWM2z`+)?o-$SUi|zr< zFY7W&$+>#94yOiD478(@wed2>2a%pBT%Op~NW`0i2V 
zG~z+@yYK|zyw08U-6msecI}1O>4VLRw4&U^eppWZ&Q9UM%ibqYpyUIw-Kmgjcc2o0 zG&=d|Ka4GV9}q=VLjt=53PP`Gt9Nv5V}9t*+SrNIMjJysP8pylm|WC{a=E?tJ0Qv^ zt4>9sY^Dci1ZyVK7bN}Is%0Wfq1ZcTNqTgs(J^A*``%DWX-fJKck3)L_H|9*H>N}w zra~|^f^|t?0itLHUvJD^OB0(FpQ-G}e;t;)->RhIt$MsoFh(*Gr9+CONfT@2Mh+wQ;6)fGLO(G6MW9WIoW5@q@D0S!h8LSUgM;-U z{jRHaySfA1&Gyb=7zJ$LwjIdY%zJu5Y$P`e-?%$l&0B2N2Q~iy1=q6GeSAyTyILE^=A8AV%oX#80F$E z(SwGsknD$@hW&|}xU#3v>)lO`EKk$;u}#gG%wxD4m&GE59e0(NDgA4}#=uL%>N6~R zJqS?H?+;Ds-Ao_WZ1Et|&BiQb%>V!hj6nhkrevCg7KxO>Pb>cdx^J*2#?0gE*HNz- zxBU(iJR@j9JtYF_w_}|h`t(QmS7EhIadejG@6PaT8S&{-35jdX3L8sBrb`qDWll7q zT4@{QI=?;E)mc>v?h(vXt|>N;i>zNg9s~yznRR#IReJRItH%Tu!Tg#m0vn-dI12cl z>;*2wPT>ootwF;b-66*z>8&f~;MJIFd}Z?oG#^#OfDVn$bhynS$cS<)&>Q|TXPe7X z8k1;8zg=F0YEcJsds7i8#?^0Qci-CeOE_NFhYp>Q_;227vXSEGYP?9hAtH+0Fm!j# zAr5|NNk?VOUX^RmhGeTJBq=~uw~nM$(l6N4uk(Hlj|f}t|5ItSPqQoAESIoPE*PJ0 zrPn7+FJ2TSdS4E=Yta-Q#Ab^?bhlio5kxV?+oH^~1(W zxKO!{c5_Muv5do1Hg*-LHy3xVa@A7Rfm2Rs&H`V^@T$SP^B=iC|BSrl#D1O?tZ6*6cHZsExtwET?2)Qwgi)t-3r{2bcOSRwm0*In zs+Sa}YvFP>p)8qZu9N*~)XMghTK^tyZU3b3rEzrZs5CoDk<9mc$AJ7UFabM7KDo!+ zh;aGUwgfF(3TtNJa1pWN=v~}VX2@dkP8x7)-@-GpFCOh&5jw7H>|_`hq1EsN(|G=Z z+I$Y44|#+p%U2OZP4OZzy@s~=c3(axrdM_v|6~ zNKY$l$E~(CtRfxvd=Wpllw!q*6N=L3rZrP=szX>H6QYcBN2_ohgii0*;7oVV(bbz{ zE9-^KGDauf{`N>De2xV~6RG}6T864}hCmC*(o|x=dM*sH9HaJmA_s}CLL4a%&Rq2I zz6&;jl0XIqyuPC zWpm}s0{C@tN2+sA#t7%%BojNEPNyBy(El(iT!Gy-!v}Uuw9uidXs{TiwI4x7i)ndv z&%&@+`H&UM^W_$q>eBK=UsVOC3(Q?efsa46LLMLdZBNxpqI02+`*Z* zF!Ux~JMGury;Pm$MNXY^9O;}88j0GwJ#~kuC{hK-_+=#dnK|KA3bSfsf-wn#9<~@t8MuAtTtE)nCq}6>&Js&z`?49Fn(g&~O3{1jo z0>3vFx2;~P!Tzc3UP+EOXc-KeK71G|8R1*Dfwj$5ZrLW^9-NcoK8oJ>vxFZ5C{H4+ zL*}(p*~^Sc2R-91BdbsO#oj*tS`63yAO5;EsxhtL-cx@Ih(=}O6n=@+_crAs+?>7I z6d+gVHbcA@rC3zT}V$rBB)8aCPfLp#Q zaTO}ERD&I1mz$o>HoP9=ST{M-ufX#ww=qd3?OiS1ZOXpZBY}J*-$8|FV@NOm19y{Y zhJeFZJZ+LIWTfGgU7z%)6qS?OCcycpGcyqKm{HvoF=&ZptIybt=f5{E-NW)|QD0yxaHT)D}(kSWx;;Iu7T}+UYT6ZsIHX z75|qcyu+r6K~n9%!Wk+|HOen)(f)?5Q~gKaGj2KxZ!~{iw-ljc^(eS)RNe#qsasB- z5b#b=W>E??OO}+cdNdVYF-pV-h9^eNAnI}JMi^D_iZDbdz*q4Zd&#IRgW7*|3d+7f zc&C!04Ind=D3A;tKGvFslJ$d*!L|X__XGlhfdv4T12>qpwTE-$oyOkJls5>gLhCIdLdIs$O@RDJOfmHX$oi@>-+A74mf7mEw^{e87rp zgwf>m8*+J;d|lPSB+VN6mW%4Zm){2&B)AK^tso84@2#QG0Y*|`&wBW0q#K+q5rtWG zO8zR#4PjfRxhZ2w!*%DCB#j&NW$l@78WF}weSN&rp(OE&ovy(Gk_m~qAOXbSOB1Vj z9*a~jBSR#{z@=RSW?NSgxp-_k=o_LhBtL`Hz!YUxAyku#`&(d7hKCP-Q&!2xgRgdK zCcoGnn6d4AzI`5yuS$r4YIrF6VTk2c1UyRny09`3npCn_HIH0BdqF+y^G+c%1xuVPzF|_X4~56t2wC2uHPD3a@Y#j` zn@$UAU_FP##NExrE$yY=E`>rC%EMt4_y&=guexTHa;*6?;-V|DpxjLa#%?3j!I)^R z6oM6nUTYmj#{My&WHK$#_z+V_`>{-nK%S!ifU744YZt=7|8uuZX&U|i9|8OcpL?S| zb!2*fNYWj1fbzzaW>Ofx8V#O#g@RdAsG+J0yz(JR0*Yn4ylh1X+K|jd1DU|~j zFo9~;3*IkHb<)1`)l(i0RR+9!<*(;jfQbY7YWE=lvNqUqZV$lMQpK0yN^KT!kX3YJ z7189eqgaL5Gy8>q>FEB+fMWm@Im$@~IUxC^K0OfImb9e%4DU!jdd*aOoSKzn^oy$vDR^`G2 zON_ES2@83?Mpt;N40*(8W!9@1S_xK|lFz#IcN{)UV@I*03AefPcH(A#iE2C zLKIN!tW8K9%TJa+6woy|-FBIniIrLutv?2;VxM&>;j2Ol{o#s|&(R1c!qw%R&&|j1 zTN6LiN(jDhiMU(w4}tgTWyypEKa&Ke0(zsVi{R%DI>QSgO;X?E8^~F~=DR>xqtv`z z^Qa7?%vvV}P-JsaF`Vg1@Qdc7bI4lvMJ5nXYI_i10NEllvm6^OVm!c&K?qv0TSoWo^gB&E#TaR%5~YO?ylhkKK=FN&&O*fI3)! zEW%-v-Wphfc+v&J7Fk0qTseaUs(7+TZSooXMQe^7flqKSxr$J|ZOtBRo9p2CEdl~B zb8dxZ&`$Sj-sS_vs#-zu1JFR8L)}3XSgsJ#@%syibd?oohxsmT(WHGP^4Ajv z6WiDWUGBeS;wdK+i!-+1{YSJP8)w7?D>5q(@|MdZX9s_T?v5sj6 zldep~Az?7C8q*e+HEzf4PZy!N9(KktLN#=~2!R{( z{Fmy7gJl4=^S>mVhBJvHSn(1Mx>*j44Z`^8eA7=x>|=Xs59APt*}*6 z6&V>>$%O3v=v6plb8wHoro=@~6Kr3hG-fYhgmlz_m@lI^?4vT~Z3QKJtCTg5G7*TV z@)BFJtZX}rwprnVj8cQoNG0l?M7$edCE}P!(Q4&FM07JdLz<`*LQ%Jy*do0-h{N0$ z(K(0o*0S!*QM>VX$$f2!a0@2QrvxmIs{y?5vhg7w0`%fKp4)lG&tB{%nE=qYPcUe? 
zIhniolYY7K7m3+9Cx;x+1R5hl>Xx`Q1TnASfFgWZxDK2!;vHGS1^F$JM6#MmfY>vh zl>A5gC9@|C4-?0+{z3Q@gc8WeXwA@%`@C_i_&}z4rR7dDmA#0&{5f*(=+vl8q4D}N z;&V+O$ByjapGMd-k||+kCf+VMt+7L;6#2+D+*AF`Jia7ygd$k%LNp`L{iNjiZBD~i z@?w^sGwWoQgvS!0>8!@-Rx-U}skI?*1q@NDd&z<1;O=ohmbyyLtPJC0vnOQ$YcoKC zl}9Rfbul^=?uda;*NEVXcdu{-*76wDw2;2%P$A_lxrLUuoK5c2ER*Y3s!f`E1&}Z# zhKFk^nl~ZBYxhK|z&5n2>)ci-r1zZI{16tvSqZo!a#ej$cWq1X;9QQGm_-}Fuh>ls z#k=)Df_v9LszRZd(EgZSUh~4KH5DJV0ru_D1U3;L<0DVZw8(ZEE=_g~omH5$h4C!P zN<%t0zX^G#6(>ITEkFd-uQ`MwK}F2sEyw|f^XvEdF;2GdR$PH7@~)y1su8k`AaTNo zT1y5HFS5TyvkiUtET4l$BtUWj6>GghfpJXFhWB|^-ITU|CaCYV>_;5!1Z6Hu9HsC* zDjL~0VE_ODumPThYH!Tzh?CdzAEcTm@pStp#i38A(ma*2QApC1>o8JSdv^rhM5O(6 zD%=Y`)=SaApzol?c)fffx?DG%TbEqeW>85#3Xju}ZI8U3E8h8))85otmCW50PS3I! z2$yi--cyluGsYw`Dr3hTx9B9`68xqJjW}j>OSIumfYR5lo+HuJj5%bg!bUz2Gq3#WndJCfO5hD&MglU80T8< zXD#-mk|SzezdzLS!Wm0T8%5nsSIV$!^m#4E#;b1Ew^gld>RaRQ!_6FXc`hEIp^-X? z0a%O#p&fu%-S@1WzHzMXZJY?$QPoLWS|!K;Y(1pctsGXVw??ZIm|zB=&=p~3NP4>3 zV-P_FdF*|TnEckHfm>tb6rgpj(rZ6(V|$>^*nt_)i(A}kbUVF9p+Em*2fU!;@5Ix7 z6S(;&n${2(26i-Hch%-f(hK78O=$M#Jpcd!MF9c{=vJgd|MI*lD)d4K8hR<$26 z@`--<*unwMZh1x6S3{gY9?-9U57l{1RGs_=DlCxZpJ+=vs^IZsFkqv;a6;yc0Xf(} zK!dA${y3ix7MWt}c(QQ1VBzToJrqBtu_UHnc7bU%`ib>%LQe`kA-*q+#5zG4_WE2d4R!8oaxwWmx(&NIVya}8$;_EL#5Lo8< zF%~%}%K#`cz;|FOY{fzpI`3P@(cI+ga$^dBjD2zMTFI%4XzH6(n|>--m~<0e=_yWs zQa_*k%@DPvY7M}#PNULSLGp3r;+E3aE0ih+P{+meHDL%3cAt@MiOi0{stjzxoaxdiNng)k^6Ri;7Udm!}P#XOj7=9O*)(%zWu# zrMoV7b!T+x?1$Sc!rT~8;sr~Cj$5)o^WJ;d0BVCQgr9%af)$KZ=50lR2B5IW?sBX|6?D%$H2*HIVb{ z;pD|{C~hw#7fy&Dw8xCns^YOX2|Q^HEuHXmqaQa)?{jffR|HX_t3XA!5&dHU4kY?m zvOVC$GusJ{9Wq!XgXi0_@Rw{zuoBf7hZR5_hw zMruiq;ZBIgw7JS!@Pq5&Lve3=R>ro>rN-)A@^w478nqxAGIkYMRASuW@d|AmPpz=8 zyVVWbojfi8;PQ2uf&N##m8eMaX|gJ<6>VVoWN@@)56+wd?W8(n*!t|d{O$UmpRz#x zrGv9K^C!B>E7zC*fyrS&WzYG4@eFsoJ=WTLsRnJXW-*j%eZV+&n{k%^-1{-^>RUHs<_L$@wR7w|@)F!h~BZ zZ|X4TRI*VX_U}4kiM_u6|3sA9--_9&7M+s@Ls{vlkbS`*l-MIKj9`~c$mTybNFGU- ztAnx=oc?dA4V12EQwE_6dks0u&}Y`^9S4>1Z?iD}u3PS0factLfF0yhG(0ym=9}6~ z>lxp(lZtH*;hw<;G7&NGdv`u(WJDFe5_w}^Y7g8)bmd?u1>DbR>=&bxWOmJ%l1Eu_ z3UFlKTwt1*8{XQPw&_$&V@R)`rjb!pXM=Tcp2G-m(Ua}5_;vKmdD3v5Gb+rb)oT*GXm z)I%im8ClOpOla94AAOjl{$PsYI0>ZBY{Nchxi-Z~>|2H|cJH(e&*B*VT4jKPMYNQs z%e4kgC5kUhT1jts^e|`V28i+`f2p(Hx`l25C0qqty?8m-4&S((1j}n1Ks0;U|JXqAEw|vTPj`=Zn@N~ zsvQs!kHzAVOZyS6)Vo#^Z^j*VRLIgBVN$zAp0TZpTX5;0+lsv1sm$S1v8)3ldk<5_ zD!jSZ3-4sI1Ms$RByC4yK(!}Ecle}AuPd|8aud5BDTU69f>E~{it;-)qIy6MOOl~c zgQiXN-JdKgXjm;6S zNdud(_=R5%{_ds39)3(< z$~6<;I0swdhvvbxiF4WkbOS1lOn*CKVDTq&v!+kCuR;q(LIK(|pUB8Yk3m1inT&8p zksIyxJ&MnBn+9WMFLvK@QFQYBD_$7JCQWkndO|2?CAL8T00_!K0tu+3o)D!bQvz52 z0^IUm#9dnQiSJywak0VU;=t3ZGvkelPJn3Ob!*0?JcU^f@tOkM4NToWVX~o-Zxh-V z!{Wu>V(Zt{2Iyu849lxQjQB6+^->icbbzrTk;H8KTiv)bYvz0xm{-g!-yhAY<5nOM zjqEUw)oBQtT50TUaVXI~nd*1Y#;Px)8%$_+^C^Xa?hc;D2EZ#*8& zG|`YZ9*VolMI>IFzW10hyw?cNKHc zQa^&s4$v|0j)erJwtA+v)|*kK_n-5%lIAD4e~w^C6>y3Yo3}68&HAOhBk+Bwcd=MJ zJ?LW8Od8dm_<5P$HZi5Dt7D@}N6+p=?^^*B)R0h5Jza1;e%X!;$OR$M-j`}paK>Fq zE(#}2y5pK0=dF7#Y;aXEUhcIF6D>}b3`G&6EQ1!E_V~@FaAq_@qE^sJuUNg`EtYLc z0%ksdCM%aSA+_%VTrDtzubgNo_6L|2=tCf^knR8=cLYQSvZQZOS+~~w1gyX9I;p4z zjvSI2$tai)k$Eucm;D9Dl1vA}ABiVdyb^wBVWw==Cx9nxNmJ!d0)g4unrdp#p!c%y zUf%;=llM}dtNv!1oE+7P&TX}aOW<9M`gpO7{JUmW4jW0XxMWrSR zU%j1o%miV)n5r$w%TN>l27SCdE?@%H&s-Mfc3sSodqCib4Br@io>D4YC}&OrJL~?73O1h| zDWsU@+PDMb%)F4a?GmG`X<#(c6FUdM6VI|@$|S1!W&c|no|(f4{*nDyhIZ3z;1V2p z_9W1ELn#he@W_H+tGG#kt zxDBnN3RaVu_V!XS^~H z9hzobeHcrgFfFy1RBS&WqumyhZFa+jf=J}CU?xawDM}*Ac7i4m3v^Ny;ErLwWj`s$ zA5xg*y3EC;tae!<`DO9qKc!kYdC{d()HGS1`hYWVTYjUo0}v1xwp)!-lhjcsWg%d! 
zs@pHN(wuX^Y3aD%3us%{5ib4mJ@Y;|;b@kei+TpAap4`ysBB{FV|7hvOYZ%zO45~o za&Kf&`--bQHj-zxI8fY^^li^%=rP28XCO4lS`Io2Ed|?apXx@tb_o|3Gsh^;l8(n* zN|o*=lzFxn~t7sXU;{uQsPn9|GwE4B;gRSs1<<@jW_qthqB z`GIlv^RiRMFS0=jB8>RxaMR70rj!j^o|CH~I35Jp_%~Y5w!_#|EbiiPA2zt_Uv5TBHs+ zVryu1Y0JcHgFF-s0CJf?X)^<;Ay3OKXBplPib{y4J<~Vj=uw$-f+>at&}U}hy0km0 zQA{E&LA7=9Qfl=r@|u470hS**^ac3scXrIaT( z3HF2sQ8Tw3x5r()aO8TtsLLu3)YxEt4m;fQxhJCpCbmSs>?#cU>D11uj%Tz$6-^AB z@@{`P^a&qg)vQhS{g<~i9@K*0DOlfDf`A7t;_{UuMAjJO)b|uojzM_a7+OF)2yU~9 z_Rx&G@WcjLDASV|{jXPf)^E3#sq~Q)3t5LvZyG^J{RYnn&)kQlZH#8-jz4PvBOS0r{`4#2(ypt>0$C(A?bf~$@Se*#a0_V{--7>~EgB78B z4=R0AtA2Zmr;(}0%E?()L+s4SIPQJ$nY2%9Yu*$xZc z97VQsapJRBmZ5tST%z?u z67+#P+}DMRf6);G_-pKv)M+ZpAi+kXKqy4)@#hMeFxjL7xHW}lcz!ZjJoN#TV$PIe zYkiXh@eQGum2aoW&;s7LNkg~vC?_mEHjhe#6#~LPz(7uFA8 zg5Xb}qns}!G}fFz9e=8&1CZnK%RKQXL7WVLgq~`O6x?7My-izYF29%rxUJ^}l>VHO z>2(-~?>W@^F5Hqnk9`uXBm;rw)WgQ)*9?b9Fz7h!aZT{a*}S(HS*KQqOC6kP`)&n3 zIVfxcAu6&S16b*Ze+pCkF9@vLKC=c-2-&2QZ^Vg^03M-CD;<7(I&?7D z60+3arJW;}*#yfS%|=~=*Wb1yQ!f`rEPS3SSt}`}O*13tjw@$QlZ!!(H1Y4uTjCE} zk1b94v;tEY==ZuW8D%~;!YV0r1PoTS1u&KAk@DJ3vfy<)wj~x4)tyXft6SHUFv67N zF0AO2!bGBuF8|z2iPC9fGI+hDDth{|JO3z9LYJ}qLD!c3V8&81D2Ek~`>}_cT#QKO zSa;F};C4sX?0I@hwe|e*k3tjfN5$GVUrv>0n=R+3`?tsnOoRYp zG;UiA2qX&!_PGgNfctx_;zHPca)gW?_s-zsm9l6~xXoMnNc~?P7_czvc4?#6g==UZ zR*`>VbTF5R>ar_-!ZDKPIa6G5;_}v`fTl_C_1k5V*1aj_dTd6T2ppQscBeu*vTng8 zog|WW)H(itZDM)Tjw2sI-BtnTPLPaK$H082{TkB40Z8u z1vcAlp}y*>gfY+Ic>6bX0qsReb&>t&cOYx~6o*Hq^bPXvm=8hzXA?*yRUmba&A408 zqSf~4I)c^u$IPowmnx0Nf^M>h@=`*Mm1*(yQbXJUMBgnwVjglO{i;v4!_kjEb8E{~ z9z2$jqXS)Zj$Y6}u||Y%ek7hL^C7&BFhy_8f48^};|T5?v3#O9gP)8(a7}Ub8w#cj zVJD`PfHr!c?_HS#&y!GSh4xVk+&Da!)y;b_&fJrO4z;u&<1+M7Pfe|d5?|;$f=tog zX?@BegY|6mF8HO`i-yl8!1xf}ecKY5IW<@Tr9O|y;EaEyes82n58r(Gi*~;#j^F^y zsPcMA2fjW1>VYNE6IFy~*!xAH;8J}VpYn#hpOe-2x9;OjHgR1XvKe~U_hwU^UDM6x z^9Z2=qHv(e5$!L)X6}lx6h&#f@KhCkn416q1PMU_f@&!zgeggsz=?lPB<@jBvTA0so-%I^Or0-Rtt0EG;j=!gI!4bS$$tH)04 zHk+p2HqaG&# zpBb>o)!35nRh zhhhx&L@R!$$!9XDseX_ks9U^=&I%SlG7Mq{m-^b2@V%=vMP`VVXxB!DagLbUu2b$; z^3!-hICqcYu-2?S>DH7ojpa@pT}7PePK-X<`)_31lIzwHrJ`^`);P!#JUv@j;SlP4o6fDCIj2s1sq zhk;FS%SXD{(SNH#JHc(sJ!%cZrwd6AVlUgUP{JjL#o*G{WDg^wlm^6id{EsFfI=i4 zc6dL*Cw;)NKi)ijcxnjSIc2KE5Wr&OW}FhXLYJ62=PJIHgStP3_iBoz-SX41nGiyB zj*^i27^)xo*s|_pOUsikGa&gZDYB2?h-CPc$Yqr;Zs2_5N0rp}`-2pkAJp~4&!7$b&_09Ey2dqpEy zO|A(}3P_s89!;H&;}$&nt|}lXQlfjm;99(jiZ{|@O)u;|ea>2rkCm9{1Mg+ri%zzC z$uOLMshw^;xVHI|Dor6Upnqcdnsk}dCngCY{+H@f2Fh!ZXMVhV$HPxVtL&nIn}wvl z?s(I&&YwVK;#Y6StJQ@I&);8SP#AEAH#Q6r^`Dh?0|Ps4l=Q4t=3#FRU8kKI%=C?0E@{e1i}Tiy@GCjQwc=|9LWX))6#_A zgu(El)+;*i0!8%&H99xE(6Lu-rtpRuboBu{(`#QB1_pyw3!!i1xN<`E;xo~T7MOs~3OoBVyCzW2>#7Bs2t5Lf>q z9-388ClF&CeChV$AhZfUXA&wzi^y`1vYJV!BqFAK!)wZ!FlpMb`Uq(16ub=p&_FN0 zBVw{e^*EumPlJ@st^C zjvCFZm>@DxS*1l47vg8U;-u`QB5Ke!g)7Y)Fw$9?F}3X?`n@5eRqt)qVQui7dxk(y zph#S~h@H#fG@4c+7P78e#7+u?R56IK|JO(YJEq(5rl)Go?`u!fmpDq02GwzMeY8~s z$*}No6;?i;pBLIG5L|>eNL=yG?QtrGOQlNZ{YN)eJnNBbWsw0ipkafXCAs>WcWk_- zoyX$=LSAcwP2Izuy>l-k17s$hg1qwy3fE)7< z+et@KM$?XtXoMY^_R&s`N&8Kiu87y{28QXX4`kyJps}YzDOn=#7usfJF3;z`;T4IS zUPUC%1>$&Oc#}!)Cy==9SHh_Psv`6|_kfShWs zGM9s|b7fKEpEZ?Ifa@`;B3!AXFw{@L5RP5hWD55)-Sw~tg=lnyGnRJP>C5o~7Ix%a z;sI(uU_QAVVl8Yq`N6Iu;-pHSmK~j|UsgvIlIU!dVuB4P!PJO3{92Xhx)#$ad;y^2 z0i1F+z%Ofhb6l?|>E_`VWyc^b&bA%Mhvjx=msq2aTum%+ZBA+%r5!xfyxjt{%|w9+ z;Y)%J#;(NfOY(4>C?oKrL;x#$YjV4NZgc8hQGfsd13*Cn3F?GXMAQHKsza-8CyDUG z?ihDVY{z0xjhXB~Sy&Wt4o*VOwvVfxm0Rw~VYyW-?qpXD6 zc?;tv7A?@DPMMyGKsb(;)FORjW@N2l+K17yg-Q*aq^1LTUq$vHgNp=B8}0riMGx4Q z4O21#+f1#}!-E$|AAtGKQySKK`rrR6xz|kDSZrrx&xe>!$+c}GLKup`B=c1;I36Sn 
[remaining base85 GIT binary patch data for the new test asset tests/multimodal/assets/corrupted.mp4 omitted]

diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 6572616769a9..6ed21de368ac 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -18,6 +18,7 @@
 
 pytestmark = pytest.mark.cpu_test
 
+ASSETS_DIR = Path(__file__).parent / "assets"
 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
 FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -140,3 +141,39 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
     )
     assert np.sum(np.isnan(sim)) / sim.size < 0.001
     assert np.nanmean(sim) > 0.99
+
+
+def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
+    """
+    Regression test for handling videos with broken frames.
+    This test uses a pre-corrupted video file (assets/corrupted.mp4) that
+    contains broken/unreadable frames to verify the video loader handles
+    them gracefully without crashing and returns accurate metadata.
+ """ + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv") + + # Load the pre-corrupted video file that contains broken frames + corrupted_video_path = ASSETS_DIR / "corrupted.mp4" + + with open(corrupted_video_path, "rb") as f: + video_data = f.read() + + loader = VIDEO_LOADER_REGISTRY.load("opencv") + frames, metadata = loader.load_bytes(video_data, num_frames=-1) + + # Verify metadata consistency: + # frames_indices must match actual loaded frames + assert frames.shape[0] == len(metadata["frames_indices"]), ( + f"Frames array size must equal frames_indices length. " + f"Got {frames.shape[0]} frames but " + f"{len(metadata['frames_indices'])} indices" + ) + + # Verify that broken frames were skipped: + # loaded frames should be less than total + assert frames.shape[0] < metadata["total_num_frames"], ( + f"Should load fewer frames than total due to broken frames. " + f"Expected fewer than {metadata['total_num_frames']} frames, " + f"but loaded {frames.shape[0]} frames" + ) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 369c5e6cb4d1..5c75bee54dd3 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -63,6 +63,63 @@ def load_bytes( ) -> tuple[npt.NDArray, dict[str, Any]]: raise NotImplementedError + @staticmethod + def _read_frames( + cap, + frame_indices: set[int], + num_expected_frames: int, + max_frame_idx: int, + ) -> tuple[npt.NDArray, int, list[int]]: + import cv2 + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8) + + i = 0 + valid_frame_indices = [] + for idx in range(max_frame_idx + 1): + ok = cap.grab() + if not ok: + # Frame is broken/unreadable, log warning + if idx in frame_indices: + logger.warning( + "Failed to grab frame %d during video loading. " + "This frame will be skipped.", + idx, + ) + continue + if idx in frame_indices: + ret, frame = cap.retrieve() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + valid_frame_indices.append(idx) + i += 1 + else: + # retrieve() failed even though grab() succeeded + logger.warning( + "Failed to retrieve frame %d during video loading. " + "This frame will be skipped.", + idx, + ) + + valid_num_frames = len(valid_frame_indices) + if valid_num_frames < num_expected_frames: + logger.warning( + "Video loading completed with %d broken/unreadable frames. " + "Expected %d frames but only loaded %d frames.", + num_expected_frames - valid_num_frames, + num_expected_frames, + valid_num_frames, + ) + + assert i == valid_num_frames, ( + f"Expected reading {valid_num_frames} frames, " + f"but only loaded {i} frames from video." + ) + + return frames[:valid_num_frames], valid_num_frames, valid_frame_indices + VIDEO_LOADER_REGISTRY = ExtensionManager() @@ -120,24 +177,10 @@ def load_bytes( ) frame_idx = uniform_sampled_frames.tolist() - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8) - - i = 0 - for idx in range(max(frame_idx) + 1): - ok = cap.grab() - if not ok: - break - if idx in frame_idx: - ret, frame = cap.retrieve() - if ret: - frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - - assert i == num_frames_to_sample, ( - f"Expected reading {num_frames_to_sample} frames, " - f"but only loaded {i} frames from video." 
+ # Convert to set for O(1) lookup performance + frame_idx_set = set(frame_idx) + frames, valid_num_frames, valid_frame_indices = cls._read_frames( + cap, frame_idx_set, num_frames_to_sample, max(frame_idx) ) # Use transformers transformers.video_utils.VideoMetadata format @@ -148,10 +191,10 @@ def load_bytes( "fps": original_fps, "duration": duration, "video_backend": "opencv", - "frames_indices": list(frame_idx), + "frames_indices": valid_frame_indices, # extra field used to control hf processor's video # sampling behavior - "do_sample_frames": num_frames_to_sample == total_frames_num, + "do_sample_frames": valid_num_frames == total_frames_num, } return frames, metadata @@ -185,10 +228,10 @@ def load_bytes( # Refer to: # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 - frame_indices: range | list[int] + frame_indices_list: list[int] if duration <= max_duration: n = int(math.floor(duration * fps)) - frame_indices = sorted( + frame_indices_list = sorted( { min(max_frame_idx, int(math.ceil(i * original_fps / fps))) for i in range(n) @@ -197,34 +240,23 @@ def load_bytes( else: num_samples = int(max_duration * fps) if num_samples >= total_frames_num: - frame_indices = range(total_frames_num) + frame_indices_list = list(range(total_frames_num)) else: target_seconds = np.linspace(0, duration, num_samples, endpoint=True) - frame_indices = sorted( + frame_indices_list = sorted( { min(max_frame_idx, int(math.ceil(t * original_fps))) for t in target_seconds } ) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8) - - i = 0 - for idx in range(total_frames_num): - ok = cap.grab() - if not ok: - break - if idx in frame_indices: - ret, frame = cap.retrieve() - if ret: - frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - - assert i == len(frame_indices), ( - f"Expected reading {len(frame_indices)} frames, " - f"but only loaded {i} frames from video." 
+ # Convert to set for O(1) lookup performance + frame_indices_set = set(frame_indices_list) + frames, valid_num_frames, valid_frame_indices = cls._read_frames( + cap, + frame_indices_set, + len(frame_indices_list), + total_frames_num - 1, ) # Use transformers transformers.video_utils.VideoMetadata format @@ -233,7 +265,7 @@ def load_bytes( "fps": original_fps, "duration": duration, "video_backend": "opencv_dynamic", - "frames_indices": list(frame_indices), + "frames_indices": valid_frame_indices, "do_sample_frames": False, } From 64192d562402a56dc1e3a2141cfe896a7f0b52e9 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 20 Nov 2025 13:23:22 +0800 Subject: [PATCH 225/578] [Bugfix] Revert custom attention mask for gemma3-mm (#28995) Signed-off-by: Isotr0py Co-authored-by: Cyrus Leung --- vllm/config/model.py | 5 - vllm/model_executor/models/gemma3_mm.py | 138 +----------------------- vllm/transformers_utils/config.py | 11 -- vllm/v1/worker/gpu_model_runner.py | 19 ---- 4 files changed, 1 insertion(+), 172 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index d1e56a72a318..97cba6ea7295 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,7 +32,6 @@ try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, - uses_custom_attention_masks, uses_mrope, ) from vllm.transformers_utils.gguf_utils import ( @@ -1625,10 +1624,6 @@ def uses_alibi(self) -> bool: def uses_mrope(self) -> bool: return uses_mrope(self.hf_config) - @property - def uses_custom_attention_masks(self) -> bool: - return uses_custom_attention_masks(self.hf_config) - @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index fe83c8b63b01..43c69e5e1399 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -596,7 +596,7 @@ def _process_image_input( def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return [] @@ -644,142 +644,6 @@ def forward( return hidden_states - def generate_attention_masks( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - mask_dtype: torch.dtype, - ) -> dict[str, Any]: - """Generate custom attention masks for Gemma3 multimodal inputs. - - This is called by V1 engine's gpu_model_runner during preprocessing - to generate attention masks that allow bidirectional attention between - image tokens while maintaining causal attention for text. - """ - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. 
- start_indices = (positions == 0).cpu().nonzero() - num_seqs = len(start_indices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_indices[i] - end_idx = start_indices[i + 1] if i < num_seqs - 1 else len(input_ids) - seq_lens.append(end_idx - start_idx) - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_idx, seq_len in enumerate(seq_lens): - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - - # Find image token positions - img_pos = input_token_ids == self.config.image_token_index - - start_idx = end_idx - - # Create a global causal mask - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0 (causal attention) - global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Enable bidirectional attention between image tokens - img_mask = torch.zeros_like(global_attn_mask) - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - global_attn_masks.append(global_attn_mask) - - # GGUF compatibility: config might be Gemma3TextConfig directly - text_config = getattr(self.config, "text_config", self.config) - sliding_window = text_config.sliding_window - if sliding_window is not None: - # Create a local causal mask with sliding window (1024) - local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) - local_attn_mask = torch.where( - local_attn_mask == 0, global_attn_mask, float("-inf") - ) - local_attn_masks.append(local_attn_mask) - - return { - "has_images": True, - "seq_lens": seq_lens, - "global_attn_masks": global_attn_masks, - "local_attn_masks": local_attn_masks, - } - - def prepare_attn_masks( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - mask_dtype: torch.dtype, - **kwargs, - ): - kwargs["has_images"] = True - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. - start_indices = (positions == 0).cpu().nonzero() - num_seqs = len(start_indices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_indices[i].item() - if i < num_seqs - 1: - end_idx = start_indices[i + 1].item() - else: - end_idx = len(input_ids) - seq_lens.append(end_idx - start_idx) - kwargs["seq_lens"] = seq_lens - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_len in seq_lens: - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - start_idx = end_idx - # Create a global causal mask. - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0. - global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Consider the bidirectional attention between image tokens. - img_mask = torch.zeros_like(global_attn_mask) - img_pos = input_token_ids == self.config.image_token_index - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - global_attn_masks.append(global_attn_mask) - - sliding_window = self.config.text_config.sliding_window - if sliding_window is not None: - # Create a local causal mask with sliding window (1024). 
- local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) - local_attn_mask = torch.where( - local_attn_mask == 0, global_attn_mask, float("-inf") - ) - local_attn_masks.append(local_attn_mask) - kwargs["global_attn_masks"] = global_attn_masks - kwargs["local_attn_masks"] = local_attn_masks - return kwargs - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4ca155af03dc..df24738477e7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -520,17 +520,6 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False -def uses_custom_attention_masks(config: PretrainedConfig) -> bool: - """Detect if model uses custom attention mask generation for multimodal. - - Some multimodal models require custom attention masks that enable - bidirectional attention between image tokens while maintaining causal - attention for text tokens. Currently applies to Gemma3 multimodal models. - """ - architectures = getattr(config, "architectures", []) - return "Gemma3ForConditionalGeneration" in architectures - - def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): """ Update kwargs for AutoConfig initialization based on model_type diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 80f8344d4410..0490ed39c8c7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -324,7 +324,6 @@ def __init__( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - self.uses_custom_attention_masks = model_config.uses_custom_attention_masks self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config ) @@ -2352,24 +2351,6 @@ def _preprocess( **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - - # Generate custom attention masks for models that require them. - # V1 pre-generates embeddings, so forward() skips prepare_attn_masks(). - # Check mm_features (mm_embeds is empty during decode). - has_mm_features = any( - req_state.mm_features for req_state in self.requests.values() - ) - if ( - self.uses_custom_attention_masks - and has_mm_features - and hasattr(self.model, "generate_attention_masks") - ): - mask_kwargs = self.model.generate_attention_masks( - self.input_ids.gpu[:num_scheduled_tokens], - self.positions.gpu[:num_scheduled_tokens], - mask_dtype=self.model.dtype, - ) - model_kwargs.update(mask_kwargs) elif self.enable_prompt_embeds and is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. 
From a9705a290af05ad71023714074ad8bf1a50c60a3 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 20 Nov 2025 06:04:23 +0000 Subject: [PATCH 226/578] [Model][QwenVL] Replace `torch.repeat_interleave` with faster `np.repeat` (#28964) Signed-off-by: Lukas Geiger --- .../models/multimodal/generation/test_qwen2_vl.py | 14 ++------------ vllm/model_executor/models/qwen2_vl.py | 15 +++++++++------ vllm/model_executor/models/qwen3_vl.py | 12 +++++++----- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e10b8e1e77af..e1b7dbf99f1f 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -128,12 +128,7 @@ def get_image_embeds(model): visual = model.visual pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to( - visual.device, dtype=torch.int64 - ) - return visual( - pixel_values_on_device, grid_thw=image_grid_thw_on_device - ).cpu() + return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu() image_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -217,12 +212,7 @@ def get_image_embeds(model): visual = model.visual pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to( - visual.device, dtype=torch.int64 - ) - return visual( - pixel_values_on_device, grid_thw=video_grid_thw_on_device - ).cpu() + return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu() video_embeds = torch.concat(llm.apply_model(get_image_embeds)) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d25ff2785bfe..479a7871e364 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -29,6 +29,7 @@ from functools import partial from typing import Annotated, Any, Literal, TypeAlias +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -751,25 +752,27 @@ def forward( if isinstance(grid_thw, list): grid_thw_list = grid_thw - grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + grid_thw = np.array(grid_thw, dtype=np.int32) else: grid_thw_list = grid_thw.tolist() + grid_thw = grid_thw.numpy() # compute position embedding rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) # compute cu_seqlens - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=torch.int32) - cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) - cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) + cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype=np.int32 + ) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens) # transformers x = x.unsqueeze(1) # pre-compute seqlens for attn mask to reduce cuMemcpy operations max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) for blk in self.blocks: x = blk( x, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c10aeaec5ab8..90c4894d33e8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -553,18 +553,20 @@ def forward( if isinstance(grid_thw, list): grid_thw_list = grid_thw - 
grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + grid_thw = np.array(grid_thw, dtype=np.int32) else: grid_thw_list = grid_thw.tolist() + grid_thw = grid_thw.numpy() pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) + cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype=np.int32 + ) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens) hidden_states = hidden_states.unsqueeze(1) max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) From 1c7bcc55b86d6cb867072dfb890dec6c7e747a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Wed, 19 Nov 2025 23:20:12 -0700 Subject: [PATCH 227/578] [Frontend] Allow parsed tool arguments (#28820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/entrypoints/chat_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3b722c2d9277..03214c4d131b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1437,7 +1437,8 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: for item in message["tool_calls"]: # if arguments is None or empty string, set to {} if content := item["function"].get("arguments"): - item["function"]["arguments"] = json.loads(content) + if not isinstance(content, (dict, list)): + item["function"]["arguments"] = json.loads(content) else: item["function"]["arguments"] = {} From 20e4497be23f8e74882bfb0bd0db3d30dd821afc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 20 Nov 2025 14:39:10 +0800 Subject: [PATCH 228/578] [V0 Deprecation] Remove `num_lookahead_slots` (#29000) Signed-off-by: DarkLight1337 Co-authored-by: Michael Goin --- vllm/config/scheduler.py | 9 --------- vllm/config/speculative.py | 10 ---------- vllm/engine/arg_utils.py | 11 ----------- 3 files changed, 30 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 8194295ffedb..b6078706daac 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -62,15 +62,6 @@ class SchedulerConfig: """For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.""" - num_lookahead_slots: int = Field(default=0, ge=0) - """The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - - NOTE: This will be replaced by speculative config in the future; it is - present to enable correctness tests until then.""" - enable_chunked_prefill: bool = True """If True, prefill requests can be chunked based on the remaining `max_num_batched_tokens`. 
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 13a8632413d9..a0c65b6049e1 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -634,16 +634,6 @@ def _verify_args(self) -> Self: return self - @property - def num_lookahead_slots(self) -> int: - """The number of additional slots the scheduler should allocate per - step, in addition to the slots allocated for each known token. - - This is equal to the number of speculative tokens, as each speculative - token must be scored. - """ - return self.num_speculative_tokens - def use_eagle(self) -> bool: return self.method in ("eagle", "eagle3", "mtp") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 68205b6079d7..74828bc109cb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -488,7 +488,6 @@ class EngineArgs: ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override - num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") @@ -1081,9 +1080,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--long-prefill-token-threshold", **scheduler_kwargs["long_prefill_token_threshold"], ) - scheduler_group.add_argument( - "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"] - ) # multi-step scheduling has been removed; corresponding arguments # are no longer supported. scheduler_group.add_argument( @@ -1653,18 +1649,11 @@ def create_engine_config( target_parallel_config=parallel_config, ) - # make sure num_lookahead_slots is set appropriately depending on - # whether speculative decoding is enabled - num_lookahead_slots = self.num_lookahead_slots - if speculative_config is not None: - num_lookahead_slots = speculative_config.num_lookahead_slots - scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, - num_lookahead_slots=num_lookahead_slots, enable_chunked_prefill=self.enable_chunked_prefill, disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, From 7218f83992c7d61fc3845ea24407a1f3b909713e Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 20 Nov 2025 15:50:23 +0800 Subject: [PATCH 229/578] [ROCm][BugFix] Fix shared expert loading error when disable `VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS` (#28633) Signed-off-by: ganyi --- vllm/model_executor/models/deepseek_v2.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c50fc327e760..d0a116b97997 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -287,7 +287,10 @@ def __init__( ) self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - if config.n_shared_experts is None or self.is_rocm_aiter_moe_enabled: + self.is_fusion_moe_shared_experts_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + ) + if config.n_shared_experts is None or self.is_fusion_moe_shared_experts_enabled: self.shared_experts = None else: intermediate_size = config.moe_intermediate_size * config.n_shared_experts @@ -327,7 +330,7 @@ def __init__( 
num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, n_shared_experts=config.n_shared_experts - if rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + if self.is_fusion_moe_shared_experts_enabled else None, ) From 1e1c06789e63a760d91aaf6e4ddfeabfe382c301 Mon Sep 17 00:00:00 2001 From: Bradley D Date: Wed, 19 Nov 2025 23:53:38 -0800 Subject: [PATCH 230/578] [ci][amd] fix EPLB execution test (#28742) Signed-off-by: Bradley Davis --- tests/distributed/test_eplb_execute.py | 423 +++++++++++++------------ 1 file changed, 213 insertions(+), 210 deletions(-) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 7b45ae82c72d..0a97749ac318 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import multiprocessing import os import random import pytest import torch import torch.distributed +import torch.multiprocessing as mp from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( @@ -17,10 +17,12 @@ ) from vllm.utils.system_utils import update_environment_variables +mp.set_start_method("spawn", force=True) -def distributed_run(fn, world_size): + +def distributed_run(fn, world_size, *args): number_of_processes = world_size - processes: list[multiprocessing.Process] = [] + processes: list[mp.Process] = [] for i in range(number_of_processes): env: dict[str, str] = {} env["RANK"] = str(i) @@ -29,7 +31,7 @@ def distributed_run(fn, world_size): env["LOCAL_WORLD_SIZE"] = str(number_of_processes) env["MASTER_ADDR"] = "localhost" env["MASTER_PORT"] = "12345" - p = multiprocessing.Process(target=fn, args=(env,)) + p = mp.Process(target=fn, args=(env, world_size, *args)) processes.append(p) p.start() @@ -40,24 +42,16 @@ def distributed_run(fn, world_size): assert p.exitcode == 0 -def worker_fn_wrapper(fn): - # `multiprocessing.Process` cannot accept environment variables directly - # so we need to pass the environment variables as arguments - # and update the environment variables in the function - def wrapped_fn(env): - update_environment_variables(env) - local_rank = os.environ["LOCAL_RANK"] - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - init_distributed_environment() - - # Ensure each worker process has the same random seed - random.seed(42) - torch.manual_seed(42) - - fn() +def set_env_vars_and_device(env: dict[str, str]) -> None: + update_environment_variables(env) + local_rank = os.environ["LOCAL_RANK"] + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + init_distributed_environment() - return wrapped_fn + # Ensure each worker process has the same random seed + random.seed(42) + torch.manual_seed(42) def create_expert_indices_with_redundancy( @@ -275,6 +269,79 @@ def verify_redundant_experts_have_same_weights( ) +def _test_rearrange_expert_weights_with_redundancy( + env, world_size, num_layers, num_local_experts, num_logical_experts +) -> None: + # Initialize model parallel (using tensor parallel as an entrypoint + # to expert parallel) + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + # Test 
parameters + total_physical_experts = world_size * num_local_experts + hidden_sizes = [32, 64] # Two different weight matrices + + # Create old expert indices (with redundancy) + redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + + old_indices = create_expert_indices_with_redundancy( + num_layers, + num_logical_experts, + total_physical_experts, + redundancy_config, + ) + + # Create new expert indices (with redundancy) + new_redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_indices = create_expert_indices_with_redundancy( + num_layers, + num_logical_experts, + total_physical_experts, + new_redundancy_config, + ) + + # Create expert weights + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) + + # Execute weight rearrangement + rearrange_expert_weights_inplace( + old_indices, + new_indices, + expert_weights, + ep_group, + is_profile=False, + ) + + # Verify the rearrangement result + verify_expert_weights_after_shuffle( + expert_weights, + new_indices, + hidden_sizes, + ep_rank, + num_local_experts, + ) + + verify_redundant_experts_have_same_weights( + expert_weights, + new_indices, + hidden_sizes, + world_size, + num_local_experts, + ) + + @pytest.mark.parametrize( "world_size,num_layers,num_local_experts,num_logical_experts", [ @@ -305,78 +372,69 @@ def test_rearrange_expert_weights_with_redundancy( if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") - - @worker_fn_wrapper - def worker_fn(): - # Initialize model parallel (using tensor parallel as an entrypoint - # to expert parallel) - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - # Test parameters - total_physical_experts = world_size * num_local_experts - hidden_sizes = [32, 64] # Two different weight matrices - - # Create old expert indices (with redundancy) - redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - - old_indices = create_expert_indices_with_redundancy( - num_layers, - num_logical_experts, - total_physical_experts, - redundancy_config, - ) - - # Create new expert indices (with redundancy) - new_redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - new_indices = create_expert_indices_with_redundancy( - num_layers, - num_logical_experts, - total_physical_experts, - new_redundancy_config, - ) - - # Create expert weights - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices - ) - - # Execute weight rearrangement - rearrange_expert_weights_inplace( - old_indices, - new_indices, - expert_weights, - ep_group, - is_profile=False, - ) - - # Verify the rearrangement result - verify_expert_weights_after_shuffle( - expert_weights, - new_indices, - hidden_sizes, - ep_rank, - num_local_experts, - ) - - verify_redundant_experts_have_same_weights( - expert_weights, - new_indices, - hidden_sizes, - world_size, - num_local_experts, - ) - - distributed_run(worker_fn, world_size) + distributed_run( + _test_rearrange_expert_weights_with_redundancy, + world_size, + num_layers, + num_local_experts, + num_logical_experts, + ) + + +def _test_rearrange_expert_weights_no_change(env, 
world_size) -> None: + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + num_layers = 2 + num_local_experts = 2 + total_physical_experts = world_size * num_local_experts + num_logical_experts = total_physical_experts // 2 # Some redundancy + hidden_sizes = [32, 64] + + # Create redundancy configuration + redundancy_config = [2] * num_logical_experts + + # Same indices - no change + indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, redundancy_config + ) + + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices + ) + + # Save original weights + original_weights = [] + for layer_weights in expert_weights: + layer_copy = [] + for weight in layer_weights: + layer_copy.append(weight.clone()) + original_weights.append(layer_copy) + + # Execute rearrangement (should be no change) + rearrange_expert_weights_inplace( + indices, + indices, # Same indices + expert_weights, + ep_group, + is_profile=False, + ) + + # Verify that the weights have not changed + for layer in range(num_layers): + for weight_idx in range(len(hidden_sizes)): + torch.testing.assert_close( + expert_weights[layer][weight_idx], + original_weights[layer][weight_idx], + msg=f"""Layer {layer}, weight {weight_idx} + should remain unchanged""", + ) @pytest.mark.parametrize("world_size", [2, 4]) @@ -388,62 +446,69 @@ def test_rearrange_expert_weights_no_change(world_size): if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") + distributed_run(_test_rearrange_expert_weights_no_change, world_size) - @worker_fn_wrapper - def worker_fn(): - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - num_layers = 2 - num_local_experts = 2 - total_physical_experts = world_size * num_local_experts - num_logical_experts = total_physical_experts // 2 # Some redundancy - hidden_sizes = [32, 64] - - # Create redundancy configuration - redundancy_config = [2] * num_logical_experts - - # Same indices - no change - indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, redundancy_config - ) - - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices - ) - - # Save original weights - original_weights = [] - for layer_weights in expert_weights: - layer_copy = [] - for weight in layer_weights: - layer_copy.append(weight.clone()) - original_weights.append(layer_copy) - - # Execute rearrangement (should be no change) - rearrange_expert_weights_inplace( - indices, - indices, # Same indices - expert_weights, - ep_group, - is_profile=False, - ) - - # Verify that the weights have not changed - for layer in range(num_layers): - for weight_idx in range(len(hidden_sizes)): - torch.testing.assert_close( - expert_weights[layer][weight_idx], - original_weights[layer][weight_idx], - msg=f"Layer {layer}, weight {weight_idx} should remain unchanged", - ) - distributed_run(worker_fn, world_size) +def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: + set_env_vars_and_device(env) + 
ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + num_layers = 1 + num_local_experts = 2 + total_physical_experts = world_size * num_local_experts + num_logical_experts = total_physical_experts // 2 + hidden_sizes = [32] + + # Create different index distributions + old_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + + old_indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, old_redundancy + ) + new_indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, new_redundancy + ) + + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) + + # Save original weights + original_weights = [] + for layer_weights in expert_weights: + layer_copy = [] + for weight in layer_weights: + layer_copy.append(weight.clone()) + original_weights.append(layer_copy) + + # Execute profile mode rearrangement + rearrange_expert_weights_inplace( + old_indices, + new_indices, + expert_weights, + ep_group, + is_profile=True, # Profile mode + ) + + # In profile mode, the weights should remain unchanged + for layer in range(num_layers): + for weight_idx in range(len(hidden_sizes)): + torch.testing.assert_close( + expert_weights[layer][weight_idx], + original_weights[layer][weight_idx], + msg="In profile mode, the weights should remain unchanged", + ) @pytest.mark.parametrize("world_size", [2, 4]) @@ -452,66 +517,4 @@ def test_rearrange_expert_weights_profile_mode(world_size): if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") - - @worker_fn_wrapper - def worker_fn(): - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - num_layers = 1 - num_local_experts = 2 - total_physical_experts = world_size * num_local_experts - num_logical_experts = total_physical_experts // 2 - hidden_sizes = [32] - - # Create different index distributions - old_redundancy = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - new_redundancy = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - - old_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, old_redundancy - ) - new_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, new_redundancy - ) - - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices - ) - - # Save original weights - original_weights = [] - for layer_weights in expert_weights: - layer_copy = [] - for weight in layer_weights: - layer_copy.append(weight.clone()) - original_weights.append(layer_copy) - - # Execute profile mode rearrangement - rearrange_expert_weights_inplace( - old_indices, - new_indices, - expert_weights, - ep_group, - is_profile=True, # Profile mode - ) - - # In profile mode, the weights should remain unchanged - for layer in range(num_layers): - for weight_idx in 
range(len(hidden_sizes)): - torch.testing.assert_close( - expert_weights[layer][weight_idx], - original_weights[layer][weight_idx], - msg="In profile mode, the weights should remain unchanged", - ) - - distributed_run(worker_fn, world_size) + distributed_run(_test_rearrange_expert_weights_profile_mode, world_size) From 2c52c7fd9a480f96ac93e63eccf9a3ee01686ad4 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:52:23 -0500 Subject: [PATCH 231/578] [Bug] Fix torch dynamo warning Dynamo detected a call to a `functools.lru_cache` (#29038) Signed-off-by: yewentao256 --- tests/v1/determinism/conftest.py | 5 +-- tests/v1/determinism/test_batch_invariance.py | 35 +++++-------------- .../test_online_batch_invariance.py | 12 +++++-- tests/v1/determinism/utils.py | 20 +++++++++++ vllm/model_executor/layers/batch_invariant.py | 20 ++++++----- 5 files changed, 52 insertions(+), 40 deletions(-) diff --git a/tests/v1/determinism/conftest.py b/tests/v1/determinism/conftest.py index 3c2136e00584..bde02bbd0d5c 100644 --- a/tests/v1/determinism/conftest.py +++ b/tests/v1/determinism/conftest.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import pytest +import vllm.model_executor.layers.batch_invariant as batch_invariant + @pytest.fixture(autouse=True) def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): """Automatically enable batch invariant kernel overrides for all tests.""" + monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", True) monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") - yield diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index d4e88891512c..74ae5e182da7 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -6,29 +6,16 @@ import pytest import torch -from utils import _extract_step_logprobs, _random_prompt, skip_unsupported +from utils import ( + BACKENDS, + _extract_step_logprobs, + _random_prompt, + resolve_model_name, + skip_unsupported, +) +import vllm.model_executor.layers.batch_invariant as batch_invariant from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -BACKENDS: list[str] = [ - "FLASH_ATTN", - "FLASHINFER", -] - -if current_platform.is_cuda() and current_platform.is_device_capability(90): - BACKENDS.append("FLASH_ATTN_MLA") - -DEFAULT_MODEL = "Qwen/Qwen3-1.7B" -MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" - - -def resolve_model_name(backend: str) -> str: - """Resolve the model name for the given backend, respecting env overrides.""" - model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) - if backend.endswith("MLA") and model == DEFAULT_MODEL: - return MLA_MODEL - return model @skip_unsupported @@ -454,14 +441,10 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). 
""" - from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant - - vllm_is_batch_invariant.cache_clear() monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test - monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") - + monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False) seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index 23f47863dd23..d74b435797f8 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -16,7 +16,8 @@ from typing import Any import openai -from utils import _random_prompt, skip_unsupported +import pytest +from utils import BACKENDS, _random_prompt, resolve_model_name, skip_unsupported from tests.utils import RemoteOpenAIServer @@ -133,9 +134,14 @@ def _compare_bs1_vs_bsn_single_process( @skip_unsupported -def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(): +@pytest.mark.parametrize("backend", BACKENDS) +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( + backend: str, monkeypatch: pytest.MonkeyPatch +) -> None: random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + # Override backend for this test (and the RemoteOpenAIServer child process). + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) + model_name = resolve_model_name(backend) prompts_all = [_random_prompt(10, 50) for _ in range(32)] sp_kwargs: dict[str, Any] = { diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 5141837faea0..7ee442551e2c 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import random import pytest @@ -12,6 +13,25 @@ reason="Requires CUDA and >= Hopper (SM90)", ) +BACKENDS: list[str] = [ + "FLASH_ATTN", + "FLASHINFER", +] + +if current_platform.is_cuda() and current_platform.is_device_capability(90): + BACKENDS.append("FLASH_ATTN_MLA") + +DEFAULT_MODEL = "Qwen/Qwen3-1.7B" +MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" + + +def resolve_model_name(backend: str) -> str: + """Resolve the model name for the given backend.""" + model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) + if backend.endswith("MLA") and model == DEFAULT_MODEL: + return MLA_MODEL + return model + def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: # Generate more realistic prompts that will actually produce varied tokens diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 5dbeb2917434..69fa6bdffd43 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Callable -from functools import cache from typing import Any import torch @@ -785,16 +784,19 @@ def enable_batch_invariant_mode(): torch.backends.cuda.preferred_blas_library(backend="cublaslt") -@cache -def vllm_is_batch_invariant(): - env_key = "VLLM_BATCH_INVARIANT" - is_overridden = False - val = os.getenv(env_key, "0") +def _read_vllm_batch_invariant() -> bool: + val = os.getenv("VLLM_BATCH_INVARIANT", "0") try: - 
is_overridden = int(val) != 0 + return int(val) != 0 except ValueError: - is_overridden = False - return is_overridden + return False + + +VLLM_BATCH_INVARIANT: bool = _read_vllm_batch_invariant() + + +def vllm_is_batch_invariant() -> bool: + return VLLM_BATCH_INVARIANT def override_envs_for_invariance(): From 322cb02872d806afcaaa7d0aac3fad7f304b7888 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 03:48:09 -0600 Subject: [PATCH 232/578] [CI/Build][AMD] Fix import errors in tests/kernels/attention (#29032) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../attention/test_cascade_flash_attn.py | 18 +++++++++++++----- tests/kernels/attention/test_flash_attn.py | 19 ++++++++++++++----- tests/kernels/attention/test_flashinfer.py | 12 ++++++++++-- .../attention/test_flashinfer_mla_decode.py | 3 ++- .../test_flashinfer_trtllm_attention.py | 3 ++- tests/kernels/moe/test_flashinfer.py | 9 ++++++++- 6 files changed, 49 insertions(+), 15 deletions(-) diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 20f573821b25..d86041d71feb 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -7,11 +7,19 @@ from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states -from vllm.vllm_flash_attn import ( - fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported, -) + +try: + from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, + ) +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "vllm_flash_attn is not supported for vLLM on ROCm.", + allow_module_level=True, + ) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 192, 256] diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 26b8c77ab482..bbd5df5419f8 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -6,11 +6,20 @@ import torch from vllm.platforms import current_platform -from vllm.vllm_flash_attn import ( - fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported, -) + +try: + from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, + ) +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "vllm_flash_attn is not supported for vLLM on ROCm.", + allow_module_level=True, + ) + NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [40, 72, 80, 128, 256] diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 82ec2ef14e56..eedeec33e0d4 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -2,12 +2,20 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import flashinfer import pytest -import torch from vllm.platforms import current_platform +try: + import flashinfer +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "flashinfer is not supported for vLLM on ROCm.", allow_module_level=True + ) + +import torch + NUM_HEADS = [(32, 8), (6, 1)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/attention/test_flashinfer_mla_decode.py b/tests/kernels/attention/test_flashinfer_mla_decode.py index 0350136677c6..d183f67d3919 100644 --- 
a/tests/kernels/attention/test_flashinfer_mla_decode.py +++ b/tests/kernels/attention/test_flashinfer_mla_decode.py @@ -3,7 +3,6 @@ import pytest import torch import torch.nn.functional as F -from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla from torch import Tensor from vllm.platforms import current_platform @@ -15,6 +14,8 @@ reason="FlashInfer MLA Requires compute capability of 10 or above.", allow_module_level=True, ) +else: + from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla def ref_mla( diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 693b849ebc5d..98ea40608b46 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import flashinfer import pytest import torch @@ -16,6 +15,8 @@ pytest.skip( "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True ) +else: + import flashinfer FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 218df4a2632c..638741e91619 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -22,7 +22,14 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8 from vllm.model_executor.models.llama4 import Llama4MoE from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe + +try: + from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "flashinfer not supported for vLLM on ROCm", allow_module_level=True + ) if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability( 90 From a903d59ffaffd9160c517fa337b3ab0265a898c3 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 20 Nov 2025 02:51:36 -0800 Subject: [PATCH 233/578] cleanup at::Tag::needs_fixed_stride_order (#28974) Signed-off-by: Boyuan Feng Co-authored-by: Cyrus Leung --- csrc/cpu/torch_bindings.cpp | 7 ++-- csrc/torch_bindings.cpp | 64 +++++++++++-------------------------- 2 files changed, 20 insertions(+), 51 deletions(-) diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b07d20bab7dd..e0e3ef71b485 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -172,7 +172,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ defined(__powerpc64__) - at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Helper function to release oneDNN handlers ops.def("release_dnnl_matmul_handler(int handler) -> ()", &release_dnnl_matmul_handler); @@ -208,15 +207,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()", - {stride_tag}); + "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()", - {stride_tag}); + "Tensor!? 
azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); #endif diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c3ae06a30e3e..5af74c2c2a6b 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -20,18 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops // - // The default behavior in PyTorch 2.6 was changed to "requires_contiguous", - // so we need - // to override this for many GEMMs with the following tag. Otherwise, - // torch.compile will force all input tensors to be contiguous(), which - // will break many custom ops that require column-major weight matrices. - // This was a bug and PyTorch 2.7 has since fixed this. -#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6 - #define stride_tag at::Tag::needs_fixed_stride_order -#else - #define stride_tag -#endif - ops.def( "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! " "y_q, Tensor! y_s," @@ -241,15 +229,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters) -> Tensor"); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); // Dequantization for AWQ. ops.def( "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor"); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); // Note about marlin kernel 'workspace' arguments: @@ -271,8 +257,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "Tensor b_scales, Tensor workspace, " "int b_q_type, " - "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor", - {stride_tag}); + "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor"); // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. @@ -298,8 +283,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor? channel_scales," " Tensor? token_scales," " str? schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); ops.def( "machete_prepack_B(" " Tensor B," @@ -319,8 +303,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " - "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor", - {stride_tag}); + "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); // conditionally compiled so impl registration is in source file // gptq_marlin repack from GPTQ. @@ -346,8 +329,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor token_scales," " ScalarType? out_type," " str? maybe_schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); // pack scales ops.def("cutlass_pack_scale_fp8(Tensor scales) -> Tensor"); // encode and reorder weight matrix @@ -394,24 +376,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_fp4_mm(Tensor! 
out, Tensor a, Tensor b," " Tensor block_scale_a, Tensor block_scale_b," - " Tensor alpha) -> ()", - {stride_tag}); + " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); // cutlass blockwise scaledgroup GEMM ops.def( "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()", - {stride_tag}); + "Tensor problem_sizes, Tensor expert_offsets) -> ()"); // conditionally compiled so impl registration is in source file // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b," " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas," - " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()", - {stride_tag}); + " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()"); // conditionally compiled so impl registration is in source file // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column @@ -419,8 +398,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_mm(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column @@ -429,8 +407,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_mm_azp(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); + " Tensor? azp, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp); // Check if cutlass scaled_mm is supported for CUDA devices of the given @@ -449,8 +426,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " " Tensor problem_sizes, Tensor a_strides, " " Tensor b_strides, Tensor c_strides, bool per_act_token, " - " bool per_out_ch) -> ()", - {stride_tag}); + " bool per_out_ch) -> ()"); ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); // A function that computes data required to run fused MoE with w8a8 grouped @@ -464,8 +440,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, Tensor! problem_sizes2, " " Tensor! input_permutation, " " Tensor! output_permutation, int num_experts, " - " int n, int k, Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " int n, int k, Tensor? blockscale_offsets) -> " + "()"); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); // A function that computes problem sizes for each expert's multiplication @@ -476,8 +452,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, " " Tensor! problem_sizes2, " " int num_experts, int n, int k, " - " Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " Tensor? blockscale_offsets) -> ()"); ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, &get_cutlass_moe_mm_problem_sizes); @@ -492,8 +467,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! 
problem_sizes2, " " Tensor expert_num_tokens, " " int num_local_experts, int padded_m, " - " int n, int k) -> ()", - {stride_tag}); + " int n, int k) -> ()"); ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA, &get_cutlass_pplx_moe_mm_data); @@ -517,8 +491,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_sparse_mm(Tensor! out, Tensor a," " Tensor bt_nzs," " Tensor bt_meta, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm); // CUTLASS sparse matrix compressor @@ -567,8 +540,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, " "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool " "use_v2_format, int bit) " - "-> Tensor", - {stride_tag}); + "-> Tensor"); ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // Post processing for GPTQ. From fb8851f25485c3c94b0a71b77ff800f55ba328cf Mon Sep 17 00:00:00 2001 From: Vensen Date: Thu, 20 Nov 2025 18:52:02 +0800 Subject: [PATCH 234/578] [Bugfix][cache_kernels]: Fix OOB in cache_kernels.cu (#28760) Signed-off-by: vensen Signed-off-by: Vensenmu --- csrc/cache_kernels.cu | 19 +++++---- tests/kernels/test_cache_kernels.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 tests/kernels/test_cache_kernels.py diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0aa0dc14c748..a6c953ee0eac 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -965,7 +965,9 @@ __global__ void gather_and_maybe_dequant_cache( } }; - for (int pid = split_start; pid < full_blocks_end; ++pid) { + const auto loop_end = + std::min((int64_t)full_blocks_end, block_table_stride - offset); + for (int pid = split_start; pid < loop_end; ++pid) { auto block_id = batch_block_table[pid]; auto block_start_ptr = src_cache + block_id * cache_block_stride; auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; @@ -976,12 +978,15 @@ __global__ void gather_and_maybe_dequant_cache( } if (partial_block_size) { - auto block_id = batch_block_table[full_blocks_end]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; - for (int eid = 0; eid < partial_block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); + if (offset + full_blocks_end < block_table_stride) { + auto block_id = batch_block_table[full_blocks_end]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = + dst + full_blocks_end * block_size * dst_entry_stride; + for (int eid = 0; eid < partial_block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } } } } diff --git a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py new file mode 100644 index 000000000000..b5d66b4ede88 --- /dev/null +++ b/tests/kernels/test_cache_kernels.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for CUDA kernels in cache_kernels.cu.""" + +import pytest +import torch + +try: + from vllm import _custom_ops as ops +except ImportError: + pytest.skip( + "Could not import vllm._custom_ops. 
(pip install -e .)", allow_module_level=True + ) + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device") +def test_gather_cache_oob(): + """ + Tests for OOB read in gather_and_maybe_dequant_cache (Issue #27909). + This test constructs a boundary case identified in the issue where + seq_starts causes the block_table offset to read out of bounds. + """ + + batch_size = 1 + block_size = 64 + entry_size = 128 + + block_table = torch.tensor([[1, 2]], dtype=torch.int32, device="cuda") + + # This will result in offset = 128 / block_size = 128 / 64 = 2 + # This will cause the kernel to try to read from + # block_table[0, 2], but its size is only 2. + seq_starts = torch.tensor([128], dtype=torch.int32, device="cuda") + + seq_len = 65 + cu_seq_lens = torch.tensor([0, seq_len], dtype=torch.int32, device="cuda") + + # src_cache: [num_blocks, block_size, entry_size] + num_blocks = 5 + src_cache = torch.randn( + (num_blocks, block_size, entry_size), dtype=torch.float16, device="cuda" + ) + + dst = torch.empty((seq_len, entry_size), dtype=torch.float16, device="cuda") + + scale = torch.tensor([1.0], dtype=torch.float32, device="cuda") + + # Calling the C++ function gather_and_maybe_dequant_cache + ops.gather_and_maybe_dequant_cache( + src_cache, + dst, + block_table, + cu_seq_lens, + batch_size, + "auto", # kv_cache_dtype + scale, + seq_starts, + ) + + torch.cuda.synchronize() + assert True + + +if __name__ == "__main__": + pytest.main([__file__]) From dc45efc8ef7fc1e2571331eaf4671e1652e2a865 Mon Sep 17 00:00:00 2001 From: Dezhan Date: Thu, 20 Nov 2025 02:52:36 -0800 Subject: [PATCH 235/578] [BugFix] Fix Llama4 Pipeline Parallelism Assert Error (#28577) Co-authored-by: Dezhan Tu --- vllm/model_executor/models/llama4.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 4c6d1d424475..e1bdfc3405f7 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -53,6 +53,7 @@ from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import ( AutoWeightsLoader, + PPMissingLayer, extract_layer_index, fast_topk, is_pp_missing_parameter, @@ -729,6 +730,9 @@ def set_moe_parameters(self): self.moe_layers = [] example_moe = None for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, Llama4DecoderLayer) if isinstance(layer.feed_forward, Llama4MoE): # Pick last one layer since the first ones may be dense layers. 
@@ -765,6 +769,9 @@ def update_physical_experts_metadata( self.num_local_physical_experts = num_local_physical_experts self.num_redundant_experts = num_physical_experts - self.num_logical_experts for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + if isinstance(layer.feed_forward, Llama4MoE): moe = layer.feed_forward moe.n_local_physical_experts = num_local_physical_experts From edfe867208482ccadbf0ef503fc43e1fbb1e48f6 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Thu, 20 Nov 2025 18:52:53 +0800 Subject: [PATCH 236/578] [Misc] don't cache `CUTLASS_REVISION` var in CMakeLists.txt (#28518) Signed-off-by: Jinzhen Lin Co-authored-by: Lucas Wilkinson --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae8e6175443f..a4cf51d17e98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) From 66483a9d00e4e26647dd26b4c49f6eca73972b8c Mon Sep 17 00:00:00 2001 From: cjackal <44624812+cjackal@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:53:09 +0900 Subject: [PATCH 237/578] [Chore] Update `xgrammar` version from 0.1.25 to 0.1.27 (#28221) Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com> --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 1058ab91a02a..f2d1c0762ef6 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -24,7 +24,7 @@ outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" +xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 6eb745d9bdf5b69bb63f897b32465c62ecb9e14a Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Thu, 20 Nov 2025 02:53:50 -0800 Subject: [PATCH 238/578] Add truncate arg to yarn to match openai implementation of gpt-oss (#28244) Signed-off-by: ashors1 Co-authored-by: Chen Zhang --- .../layers/rotary_embedding/__init__.py | 1 + .../layers/rotary_embedding/common.py | 14 +++++++------- .../layers/rotary_embedding/yarn_scaling_rope.py | 3 +++ vllm/model_executor/models/gpt_oss.py | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index ae8a7d93b50e..152d9401b8e9 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -197,6 +197,7 @@ def get_rope( "beta_fast", "beta_slow", "apply_yarn_scaling", + "truncate", ) } if "mrope_section" in rope_parameters: diff --git a/vllm/model_executor/layers/rotary_embedding/common.py 
b/vllm/model_executor/layers/rotary_embedding/common.py index 196533b61795..13f8d15cc0f7 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -117,13 +117,13 @@ def yarn_find_correction_range( dim: int, base: float = 10000, max_position_embeddings: int = 2048, -) -> tuple[int, int]: - low = math.floor( - yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) + truncate: bool = True, +) -> tuple[float | int, float | int]: + low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) + high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) + if truncate: + low = math.floor(low) + high = math.ceil(high) return max(low, 0), min(high, dim - 1) # Clamp values just in case diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py index ff46ad74b302..f01ca1e23121 100644 --- a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py @@ -28,12 +28,14 @@ def __init__( beta_fast: int = 32, beta_slow: int = 1, apply_yarn_scaling: bool = True, + truncate: bool = True, ) -> None: self.scaling_factor = scaling_factor self.extrapolation_factor = extrapolation_factor self.attn_factor = attn_factor self.beta_fast = beta_fast self.beta_slow = beta_slow + self.truncate = truncate # Get n-d magnitude scaling corrected for interpolation self.mscale = ( float(yarn_get_mscale(self.scaling_factor) * attn_factor) @@ -57,6 +59,7 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: self.rotary_dim, self.base, self.max_position_embeddings, + self.truncate, ) # Get n-d rotational scaling corrected for extrapolation inv_freq_mask = ( diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 25048330f797..8835acb8ec65 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -78,6 +78,7 @@ def __init__( ], "beta_fast": config.rope_parameters["beta_fast"], "beta_slow": config.rope_parameters["beta_slow"], + "truncate": config.rope_parameters.get("truncate", True), }, is_neox_style=True, ) From 06c20c9904644d8f65523bb747756b2eae706b8e Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 20 Nov 2025 18:54:01 +0800 Subject: [PATCH 239/578] [ROCm] Add AMD GPU support on Deepseek v3.2 and SparseMLA (#26670) Signed-off-by: ganyi --- csrc/cache_kernels.cu | 4 + vllm/attention/ops/rocm_aiter_mla_sparse.py | 210 +++++++++++ vllm/model_executor/models/deepseek_v2.py | 22 +- vllm/platforms/rocm.py | 13 +- vllm/utils/deep_gemm.py | 5 +- .../attention/backends/mla/flashmla_sparse.py | 2 +- vllm/v1/attention/backends/mla/indexer.py | 15 +- .../backends/mla/rocm_aiter_mla_sparse.py | 325 ++++++++++++++++++ vllm/v1/worker/utils.py | 2 +- 9 files changed, 583 insertions(+), 15 deletions(-) create mode 100644 vllm/attention/ops/rocm_aiter_mla_sparse.py create mode 100644 vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index a6c953ee0eac..32960cc8073b 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel( #ifndef USE_ROCM __syncwarp(); #endif +#if defined(__gfx942__) + float scale = fmaxf(amax, 1e-4) / 224.0f; 
+#else float scale = fmaxf(amax, 1e-4) / 448.0f; +#endif if (use_ue8m0) { scale = exp2f(ceilf(log2f(scale))); } diff --git a/vllm/attention/ops/rocm_aiter_mla_sparse.py b/vllm/attention/ops/rocm_aiter_mla_sparse.py new file mode 100644 index 000000000000..080e92ecc940 --- /dev/null +++ b/vllm/attention/ops/rocm_aiter_mla_sparse.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +from functools import lru_cache + +import torch + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + + +# Take from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_attention.py#L84 +def fp8_mqa_logits_torch( + q: torch.Tensor, + kv: tuple[torch.Tensor, torch.Tensor], + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Compute FP8 MQA logits for a single sequence without KV paging. + + Args: + q: Query tensor of shape [M, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with + dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or + [N, 1]) with dtype `torch.float32`. + weights: weights of shape [M, H], dtype `torch.float32`. + cu_seqlen_ks: Start indices (inclusive) for valid K per query position, + shape [M], dtype int32. + cu_seqlen_ke: End indices (exclusive) for valid K per query position, + shape [M], dtype int32. + + Returns: + Logits tensor of shape [M, N], dtype `torch.float32`. + """ + kv, scale = kv + seq_len_kv = kv.shape[0] + k = kv.to(torch.bfloat16) + q = q.to(torch.bfloat16) + + mask_lo = ( + torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None] + ) + mask_hi = ( + torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None] + ) + mask = mask_lo & mask_hi + + score = torch.einsum("mhd,nd->hmn", q, k).float() * scale + logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0) + logits = logits.masked_fill(~mask, float("-inf")) + + return logits + + +def rocm_fp8_mqa_logits( + q: torch.Tensor, + kv: tuple[torch.Tensor, torch.Tensor], + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Compute FP8 MQA logits for a single sequence without KV paging. + + Args: + q: Query tensor of shape [M, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with + dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or + [N, 1]) with dtype `torch.float32`. + weights: weights of shape [M, H], dtype `torch.float32`. + cu_seqlen_ks: Start indices (inclusive) for valid K per query position, + shape [M], dtype int32. + cu_seqlen_ke: End indices (exclusive) for valid K per query position, + shape [M], dtype int32. + + Returns: + Logits tensor of shape [M, N], dtype `torch.float32`. 
+ """ + + # TODO(ganyi): Temporarily workaround, will remove the module check and reference + # path after aiter merge this kernel into main + @lru_cache + def has_mqa_logits_module(): + return importlib.util.find_spec("aiter.ops.triton.fp8_mqa_logits") is not None + + if rocm_aiter_ops.is_enabled() and has_mqa_logits_module(): + from aiter.ops.triton.fp8_mqa_logits import fp8_mqa_logits + + kv, scale = kv + return fp8_mqa_logits(q, kv, scale, weights, cu_seqlen_ks, cu_seqlen_ke) + else: + return fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke) + + +# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_attention.py#L156 +def fp8_paged_mqa_logits_torch( + q: torch.Tensor, + kv_cache: torch.Tensor, + weights: torch.Tensor, + context_lens: torch.Tensor, + block_tables: torch.Tensor, + max_model_len: int, +): + from vllm.utils.math_utils import cdiv + + fp8_dtype = current_platform.fp8_dtype() + batch_size, next_n, _, dim = q.size() + kv_cache, scale = kv_cache[..., :dim], kv_cache[..., dim:] + scale = scale.contiguous().view(torch.float) + q = q.float() + kv_cache = kv_cache.view(fp8_dtype).float() * scale + num_block, block_size, _, dim = kv_cache.size() + logits = torch.full( + [batch_size * next_n, max_model_len], + float("-inf"), + device=q.device, + dtype=torch.float32, + ) + context_lens = context_lens.tolist() + for i in range(batch_size): + context_len = context_lens[i] + q_offsets = torch.arange(context_len - next_n, context_len, device="cuda") + weight_slice = ( + weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous() + ) + for block_rk in range(cdiv(context_len, block_size)): + block_idx = block_tables[i][block_rk] + qx, kx = q[i], kv_cache[block_idx] + k_offsets = torch.arange( + block_rk * block_size, (block_rk + 1) * block_size, device="cuda" + ) + mask = (k_offsets[None, :] < context_len) & ( + k_offsets[None, :] <= q_offsets[:, None] + ) + s = torch.where( + mask[None, :, :], + (qx.transpose(0, 1) @ kx.transpose(0, 1).transpose(1, 2)).to( + logits.dtype + ), + float("-inf"), + ) + s = torch.relu(s) * weight_slice[..., None] + s = s.sum(dim=0) + logits[ + i * next_n : (i + 1) * next_n, + block_rk * block_size : (block_rk + 1) * block_size, + ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s, float("-inf")) + return logits + + +def rocm_fp8_paged_mqa_logits( + q_fp8: torch.Tensor, + kv_cache_fp8: torch.Tensor, + weights: torch.Tensor, + context_lens: torch.Tensor, + block_tables: torch.Tensor, + schedule_metadata: torch.Tensor, + max_model_len: int, +) -> torch.Tensor: + """Compute FP8 MQA logits using paged KV-cache. + + Args: + q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape + [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last + 4 bytes per (block,pos) store the `float` dequant scale. + weights: Tensor of shape [B * next_n, H], dtype `torch.float32`. + context_lens: Tensor of shape [B], dtype int32; effective context length + for each batch element. + block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical + block indices to physical blocks in the paged cache. + schedule_metadata: Returned by `get_paged_mqa_logits_metadata`; + used to distribute work across SMs. + max_model_len: Maximum sequence length used to size the logits output. + + Returns: + Logits tensor of shape [B * next_n, max_model_len], dtype + `torch.float32`. 
+ """ + + if rocm_aiter_ops.is_enabled(): + from aiter.ops.triton.pa_mqa_logits import deepgemm_fp8_paged_mqa_logits_stage1 + + batch_size, next_n, heads, _ = q_fp8.shape + out_qk = torch.full( + (heads, batch_size * next_n, max_model_len), + float("-inf"), + device="cuda", + dtype=torch.float32, + ) + deepgemm_fp8_paged_mqa_logits_stage1( + q_fp8, + kv_cache_fp8, + weights, + out_qk, + context_lens, + block_tables, + max_model_len, + ) + return out_qk.sum(dim=0) + else: + return fp8_paged_mqa_logits_torch( + q_fp8, kv_cache_fp8, weights, context_lens, block_tables, max_model_len + ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d0a116b97997..7cfd381592b4 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -594,6 +594,7 @@ def sparse_attn_indexer( ) -> torch.Tensor: # careful! this will be None in dummy run attn_metadata = get_forward_context().attn_metadata + fp8_dtype = current_platform.fp8_dtype() # assert isinstance(attn_metadata, dict) if not isinstance(attn_metadata, dict): return sparse_attn_indexer_fake( @@ -633,7 +634,7 @@ def sparse_attn_indexer( k_fp8 = torch.empty( [chunk.total_seq_lens, head_dim], device=k.device, - dtype=torch.float8_e4m3fn, + dtype=fp8_dtype, ) k_scale = torch.empty( [chunk.total_seq_lens, 4], @@ -647,7 +648,12 @@ def sparse_attn_indexer( chunk.block_table, chunk.cu_seq_lens, ) - logits = fp8_mqa_logits( + fp8_mqa_logits_func = fp8_mqa_logits + if current_platform.is_rocm(): + from vllm.attention.ops.rocm_aiter_mla_sparse import rocm_fp8_mqa_logits + + fp8_mqa_logits_func = rocm_fp8_mqa_logits + logits = fp8_mqa_logits_func( q_fp8[chunk.token_start : chunk.token_end], (k_fp8, k_scale.view(torch.float32)), weights[chunk.token_start : chunk.token_end], @@ -692,7 +698,14 @@ def sparse_attn_indexer( next_n = padded_q_fp8_decode_tokens.shape[1] assert batch_size == decode_metadata.seq_lens.shape[0] num_padded_tokens = batch_size * next_n - logits = fp8_paged_mqa_logits( + fp8_paged_mqa_logits_func = fp8_paged_mqa_logits + if current_platform.is_rocm(): + from vllm.attention.ops.rocm_aiter_mla_sparse import ( + rocm_fp8_paged_mqa_logits, + ) + + fp8_paged_mqa_logits_func = rocm_fp8_paged_mqa_logits + logits = fp8_paged_mqa_logits_func( padded_q_fp8_decode_tokens, kv_cache, weights[:num_padded_tokens], @@ -749,7 +762,8 @@ def sparse_attn_indexer_fake( _flattened_kv = torch.empty( [total_seq_lens, head_dim + 4], device=k.device, dtype=torch.uint8 ) - _k_fp8 = _flattened_kv[..., :head_dim].view(torch.float8_e4m3fn).contiguous() + fp8_dtype = current_platform.fp8_dtype() + _k_fp8 = _flattened_kv[..., :head_dim].view(fp8_dtype).contiguous() _k_scale = _flattened_kv[..., head_dim:].view(torch.float32).contiguous() return topk_indices_buffer diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f07f068a9249..1a2f9226ddce 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -225,7 +225,18 @@ def get_attn_backend_cls( from vllm.attention.backends.registry import AttentionBackendEnum if use_sparse: - raise NotImplementedError("Sparse Attention is not supported on ROCm.") + if kv_cache_dtype.startswith("fp8"): + raise ValueError( + "ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype." + ) + assert block_size == 1, ( + "Sparse MLA backend on ROCm only supports block size 1 for now." + ) + logger.info_once("Using Sparse MLA backend on V1 engine.") + return ( + "vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse." 
+ "ROCMAiterMLASparseBackend" + ) if use_mla: if selected_backend is None: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 6b0a383a0e28..b25c1e3e1ece 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -325,6 +325,7 @@ def _align(x: int, y: int) -> int: def per_block_cast_to_fp8( x: torch.Tensor, block_size: list[int] = DEFAULT_BLOCK_SIZE, use_ue8m0: bool = False ) -> tuple[torch.Tensor, torch.Tensor]: + fp8_dtype = current_platform.fp8_dtype() assert x.dim() == 2 m, n = x.shape block_m, block_n = block_size @@ -334,9 +335,9 @@ def per_block_cast_to_fp8( x_padded[:m, :n] = x x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - sf = x_amax / 448.0 + sf = x_amax / 224.0 if current_platform.is_fp8_fnuz() else x_amax / 448.0 sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf - x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) + x_scaled = (x_view * (1.0 / sf)).to(fp8_dtype) return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view( x_view.size(0), x_view.size(2) ) diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index bb8d914d1571..3f2cc8c38327 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -168,7 +168,7 @@ def _convert_req_index_to_global_index_kernel( inblock_off = tok % BLOCK_SIZE # Guard block_table access - valid_block = block_id < max_num_blocks_per_req + valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0) bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1 base = tl.load(bt_ptr, mask=valid_block, other=0) diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index 37aa5dad89a0..cc0988435768 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -11,7 +11,8 @@ ) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata +from vllm.platforms import current_platform +from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, is_deep_gemm_supported from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -23,7 +24,9 @@ class DeepseekV32IndexerBackend(AttentionBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] + supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [ + 1 if current_platform.is_rocm() else 64 + ] @classmethod def get_supported_head_sizes(cls) -> list[int]: @@ -328,10 +331,10 @@ def build( requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item() seq_lens = common_attn_metadata.seq_lens[:num_decodes] - - self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( - seq_lens, self.kv_cache_spec.block_size, self.num_sms - ) + if is_deep_gemm_supported(): + self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( + seq_lens, self.kv_cache_spec.block_size, self.num_sms + ) decode_metadata = DeepSeekV32IndexerDecodeMetadata( block_table=common_attn_metadata.block_table_tensor[:num_decodes, ...], seq_lens=common_attn_metadata.seq_lens[:num_decodes], diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py new file mode 100644 index 000000000000..c0e7f0e380b9 --- /dev/null +++ 
b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py @@ -0,0 +1,325 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Optional + +import numpy as np +import torch + +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionLayer, + AttentionMetadata, +) +from vllm.attention.backends.utils import get_mla_dims +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import ( + MLACommonBaseImpl, +) +from vllm.v1.attention.backends.mla.flashmla_sparse import ( + triton_convert_req_index_to_global_index, +) +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata, +) +from vllm.v1.kv_cache_interface import AttentionSpec + +if TYPE_CHECKING: + from vllm.model_executor.models.deepseek_v2 import Indexer +logger = init_logger(__name__) + + +class ROCMAiterMLASparseBackend(AttentionBackend): + accept_output_buffer: bool = True + + @staticmethod + def get_name() -> str: + return "ROCM_AITER_MLA_SPARSE" + + @staticmethod + def get_metadata_cls() -> type[AttentionMetadata]: + return ROCMAiterMLASparseMetadata + + @staticmethod + def get_builder_cls() -> type["ROCMAiterMLASparseMetadataBuilder"]: + return ROCMAiterMLASparseMetadataBuilder + + @staticmethod + def get_impl_cls() -> type["ROCMAiterMLASparseImpl"]: + return ROCMAiterMLASparseImpl + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, # assumed to be 1 for MLA + head_size: int, + cache_dtype_str: str = "auto", + ) -> tuple[int, ...]: + return (num_blocks, block_size, head_size) + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [576] + + +@dataclass +class ROCMAiterMLASparseMetadata: + num_reqs: int + max_query_len: int + max_seq_len: int + + num_actual_tokens: int # Number of tokens excluding padding. 
+ query_start_loc: torch.Tensor + slot_mapping: torch.Tensor + + block_table: torch.Tensor + req_id_per_token: torch.Tensor + block_size: int = 1 + topk_tokens: int = 2048 + + +@dataclass +class ROCMAiterMLASparseMetadataBuilder( + AttentionMetadataBuilder[ROCMAiterMLASparseMetadata] +): + cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + self.kv_cache_spec = kv_cache_spec + self.model_config = vllm_config.model_config + parallel_config = vllm_config.parallel_config + self.device = device + + self.num_heads = self.model_config.get_num_attention_heads(parallel_config) + self.mla_dims = get_mla_dims(self.model_config) + self.topk_tokens = vllm_config.model_config.hf_config.index_topk + self.topk_tokens_tensor = torch.tensor( + [self.topk_tokens], device=device, dtype=torch.int32 + ) + self.max_model_len_tensor = torch.tensor( + [self.model_config.max_model_len], device=device, dtype=torch.int32 + ) + # this is ignored by `flash_mla_with_kvcache` if indices not None + self.dummy_block_table = torch.empty( + (1, 1), dtype=torch.int32, device=self.device + ) + + self.req_id_per_token_buffer = torch.empty( + (vllm_config.scheduler_config.max_num_batched_tokens,), + dtype=torch.int32, + device=device, + ) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> ROCMAiterMLASparseMetadata: + num_tokens = common_attn_metadata.num_actual_tokens + starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32) + seg_lengths = np.diff(starts) + req_id_per_token = np.repeat( + np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths + ) + # Zero-fill for cudagraphs + self.req_id_per_token_buffer.fill_(0) + self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_( + torch.from_numpy(req_id_per_token), non_blocking=True + ) + req_id_per_token = self.req_id_per_token_buffer[:num_tokens] + + metadata = ROCMAiterMLASparseMetadata( + num_reqs=common_attn_metadata.num_reqs, + max_query_len=common_attn_metadata.max_query_len, + max_seq_len=common_attn_metadata.max_seq_len, + num_actual_tokens=common_attn_metadata.num_actual_tokens, + query_start_loc=common_attn_metadata.query_start_loc, + slot_mapping=common_attn_metadata.slot_mapping, + block_table=common_attn_metadata.block_table_tensor, + req_id_per_token=req_id_per_token, + block_size=self.kv_cache_spec.block_size, + topk_tokens=self.topk_tokens, + ) + return metadata + + +# Take from +# https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla_prefill.py#L72 +def reference_mla_sparse_prefill( + q: torch.Tensor, kv: torch.Tensor, indices: torch.Tensor, sm_scale: float, d_v: int +) -> tuple[torch.Tensor, torch.Tensor]: + import math + + def log2sumexp2(a: torch.Tensor, dim: int) -> torch.Tensor: + return torch.logsumexp(a * math.log(2), dim=dim) * math.log2(math.e) + + skv = kv.shape[0] + sq = q.shape[0] + topk = indices.shape[-1] + dqk = q.shape[-1] + indices = indices[:, 0, :] # [s_q, topk] + invalid_indices_mask = (indices < 0) | (indices >= skv) + indices[invalid_indices_mask] = 0 + qs = q # [s_q, h_q, d_qk] + kvs = kv[:, 0, :][indices].view(sq, topk, dqk) # [s_q, topk, d_qk] + + attn_score = (qs @ kvs.transpose(1, 2)).float() # [s_q, h_q, topk] + attn_score.masked_fill_(invalid_indices_mask.unsqueeze(1), float("-inf")) + attn_score *= sm_scale * math.log2(math.e) + lse = 
log2sumexp2(attn_score, dim=-1) # [s_q, h_q] + attn_score = torch.exp2(attn_score - lse.unsqueeze(-1)) # [s_q, h_q, topk] + result = attn_score.to(q.dtype) @ kvs[:, :, :d_v] + return (result, lse) + + +class ROCMAiterMLASparseImpl(MLACommonBaseImpl[ROCMAiterMLASparseMetadata]): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + topk_indice_buffer: torch.Tensor | None = None, + indexer: Optional["Indexer"] = None, + **mla_args, + ) -> None: + super().__init__( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **mla_args, + ) + self.softmax_scale = scale + assert indexer is not None + self.topk_indices_buffer = indexer.topk_indices_buffer + self.is_fp8bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled() + + def _forward_bf16_kv( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + attn_metadata: ROCMAiterMLASparseMetadata, + ) -> torch.Tensor: + num_tokens = q.shape[0] + kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view( + -1, 1, kv_c_and_k_pe_cache.shape[-1] + ) + + topk_indices = topk_indices.view(num_tokens, 1, -1) + output = reference_mla_sparse_prefill( + q, kv_c_and_k_pe_cache, topk_indices, self.softmax_scale, 512 + )[0] + return output[:, : self.num_heads, :] + + def forward( + self, + layer: AttentionLayer, + q: torch.Tensor, + k_c_normed: torch.Tensor, # key in unified attn + k_pe: torch.Tensor, # value in unified attn + kv_cache: torch.Tensor, + attn_metadata: ROCMAiterMLASparseMetadata, + output: torch.Tensor | None = None, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, + ) -> torch.Tensor: + # NOTE(lucas): for the sparse FlashMLA kernels the kernels want to use + # MQA 576/512 approach for both prefill and decode + + assert output is not None, "Output tensor must be provided." + + if output_scale is not None or output_block_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported for ROCMAiterMLASparse" + ) + + if attn_metadata is None: + # The zero fill is required when used with DP + EP + # to ensure all ranks within a DP group compute the + # same expert outputs. + return output.fill_(0) + + num_actual_toks = attn_metadata.num_actual_tokens + + # Inputs and outputs may be padded for CUDA graphs + + q = q[:num_actual_toks, ...] + k_c_normed = k_c_normed[:num_actual_toks, ...] + k_pe = k_pe[:num_actual_toks, ...] 
+ + q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + if self.is_fp8bmm_enabled: + # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) + ql_nope = rocm_aiter_ops.triton_fp8_bmm( + q_nope, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True + ) + else: + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + ql_nope = ql_nope.transpose(0, 1) + + topk_indices = self.topk_indices_buffer[:num_actual_toks] + + topk_indices_global = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token, + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=attn_metadata.topk_tokens, + ) + + q = torch.cat([ql_nope, q_pe], dim=-1) + + # write the latent and rope to kv cache + if kv_cache.numel() > 0: + ops.concat_and_cache_mla( + k_c_normed, + k_pe.squeeze(1), + kv_cache, + attn_metadata.slot_mapping.flatten(), + kv_cache_dtype=self.kv_cache_dtype, + scale=layer._k_scale, + ) + + attn_out = self._forward_bf16_kv( + q, kv_cache, topk_indices_global, attn_metadata + ) + + self._v_up_proj(attn_out, out=output[:num_actual_toks]) + return output diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 095407a8b959..9e99ea964ee0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -316,7 +316,7 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda() or current_platform.is_xpu(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # We know that the GPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. 
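For intuition, the base-2 arithmetic in `reference_mla_sparse_prefill` above is an ordinary softmax rewritten in base 2: scores are multiplied by `sm_scale * log2(e)`, the normalizer is computed with `log2sumexp2`, and probabilities are recovered with `exp2`. A minimal sketch of that identity follows; it is not part of the patch series, and the tensor shape and the 0.1 scale are made-up illustration values:

import math

import torch

# Hypothetical [num_heads, topk] attention scores and softmax scale.
scores = torch.randn(4, 8)
scale = 0.1

# Reference: plain softmax in base e.
ref = torch.softmax(scores * scale, dim=-1)

# Base-2 path, mirroring reference_mla_sparse_prefill.
s2 = scores * scale * math.log2(math.e)                               # exponent in base 2
lse2 = torch.logsumexp(s2 * math.log(2), dim=-1) * math.log2(math.e)  # log2(sum(2**s2))
out = torch.exp2(s2 - lse2.unsqueeze(-1))

assert torch.allclose(ref, out, atol=1e-5)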
From c0c2dd1e0b75c70706f4d8dbcd1d75f1c1750e14 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 20 Nov 2025 12:55:10 +0200 Subject: [PATCH 240/578] [BugFix] kv_offloading: Fix bug in loading of partial cpu blocks (#28951) Signed-off-by: Or Ozeri Co-authored-by: Cyrus Leung --- tests/v1/kv_offload/test_cpu_gpu.py | 4 ++-- vllm/v1/kv_offload/worker/cpu_gpu.py | 20 +++++++++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 0d4fa344d298..a248104e16d2 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -103,8 +103,8 @@ def test_transfer( for i in range(gpu_blocks_per_cpu_block): cpu_blocks_in_gpu_block_size.append(i + base_block_id) - # maybe skip a GPU block to test writing to the middle of a CPU block - if gpu_to_cpu: + # maybe skip a GPU block to test reading from the middle of a CPU block + if not gpu_to_cpu: gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :] cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[ gpu_blocks_per_cpu_block - 1 : diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 0f2ec4a1b41f..111046377a5d 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -135,22 +135,20 @@ def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: assert src_blocks.ndim == 1 assert dst_blocks.ndim == 1 - dst_sub_blocks_to_skip = -src_blocks.size % dst_block_size_factor src_sub_block_count = src_blocks.size * src_block_size_factor + dst_sub_block_count = dst_blocks.size * dst_block_size_factor + src_sub_blocks_to_skip = -dst_blocks.size % src_block_size_factor - assert ( - src_sub_block_count - == dst_blocks.size * dst_block_size_factor - dst_sub_blocks_to_skip - ) + assert dst_sub_block_count == src_sub_block_count - src_sub_blocks_to_skip - src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64) - expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0]) + src_to_dst = np.empty((dst_sub_block_count, 2), dtype=np.int64) expand_block_ids( - dst_blocks, - dst_block_size_factor, - src_to_dst[:, 1], - skip_count=dst_sub_blocks_to_skip, + src_blocks, + src_block_size_factor, + src_to_dst[:, 0], + skip_count=src_sub_blocks_to_skip, ) + expand_block_ids(dst_blocks, dst_block_size_factor, src_to_dst[:, 1]) src_to_dst_tensor = torch.from_numpy(src_to_dst) event = self.events_pool.pop() if self.events_pool else torch.Event() From c9e093116c00781dda86df7a77e976c614b35d51 Mon Sep 17 00:00:00 2001 From: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:00:19 +0900 Subject: [PATCH 241/578] [MODEL] Implement plamo3 (#28834) Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com> --- docs/models/supported_models.md | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/plamo3.py | 431 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 5 files changed, 438 insertions(+) create mode 100644 vllm/model_executor/models/plamo3.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 80fe143269a7..f0531ced0aaa 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -436,6 +436,7 @@ th { | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. 
| ✅︎ | ✅︎ | | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | +| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | | ✅︎ | | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 0ab94d30858f..89f035d2cdd6 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -130,6 +130,7 @@ def iter_params(self, model_id: str): "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "pfnet/plamo-2-1b": PPTestSettings.fast(), + "pfnet/plamo-3-nict-2b-base": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), # Tests TransformersForCausalLM "hmellor/Ilama-3.2-1B": PPTestSettings.fast(), diff --git a/tests/models/registry.py b/tests/models/registry.py index 094f921e4305..1999e3cd2de2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -402,6 +402,10 @@ def check_available_online( "pfnet/plamo-2-1b", trust_remote_code=True, ), + "Plamo3ForCausalLM": _HfExamplesInfo( + "pfnet/plamo-3-nict-2b-base", + trust_remote_code=True, + ), "QWenLMHeadModel": _HfExamplesInfo( "Qwen/Qwen-7B-Chat", max_transformers_version="4.53", diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py new file mode 100644 index 000000000000..5bb07722a5fc --- /dev/null +++ b/vllm/model_executor/models/plamo3.py @@ -0,0 +1,431 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only PLaMo3 model.""" + +from collections.abc import Iterable +from itertools import islice +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + LoaderFunction, + composed_weight_loader, + default_weight_loader, +) +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + extract_layer_index, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) +from vllm.model_executor.utils import set_weight_attrs 
+from vllm.sequence import IntermediateTensors + + +# Only used for type hinting. +class Plamo3Config(PretrainedConfig): # type: ignore + model_type: str = "plamo3" + + hidden_size: int + num_hidden_layers: int + rms_norm_eps: float + # Attention + num_attention_heads: int + head_dim: int + num_key_value_heads: int + # vllm rename `sliding_window` attr to `interleaved_sliding_window` + # if `sliding_window` is list + interleaved_sliding_window: list[int | None] + sliding_window_pattern: int + rope_theta: int + rope_local_theta: int + # MLP + intermediate_size: int + # Tokenizer + vocab_size: int + + +def rms_norm_weight_loader(offset: float) -> LoaderFunction: + return composed_weight_loader( + default_weight_loader, + lambda x: x + offset, + ) + + +class DenseMLP(nn.Module): + def __init__( + self, + config: Plamo3Config, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_up_proj = MergedColumnParallelLinear( + self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + prefix=f"{prefix}.gate_up_proj", + quant_config=quant_config, + return_bias=False, + ) + self.act = SiluAndMul() + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=False, + prefix=f"{prefix}.down_proj", + quant_config=quant_config, + return_bias=False, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + h = self.gate_up_proj(hidden_states) + h = self.act(h) + return self.down_proj(h) + + +class Plamo3AttentionMixer(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + layer_idx = extract_layer_index(prefix) + full_attn = config.interleaved_sliding_window[layer_idx] is None + + self.rope_theta = config.rope_theta if full_attn else config.rope_local_theta + self.rope_scaling = ( + config.rope_scaling if hasattr(config, "rope_scaling") else None + ) + max_position = config.max_position_embeddings + if hasattr(vllm_config.model_config, "max_model_len") and isinstance( + vllm_config.model_config.max_model_len, int + ): + max_position = min(max_position, vllm_config.model_config.max_model_len) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + rope_scaling=self.rope_scaling, + ) + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + set_weight_attrs( + self.q_norm.weight, {"weight_loader": rms_norm_weight_loader(offset=1.0)} + ) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + set_weight_attrs( + self.k_norm.weight, {"weight_loader": rms_norm_weight_loader(offset=1.0)} + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=vllm_config.cache_config, + per_layer_sliding_window=config.interleaved_sliding_window[layer_idx], + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs: Any, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + q_shape = q.shape + q = q.reshape(q_shape[:-1] + (q_shape[-1] // self.head_dim, self.head_dim)) + q = self.q_norm.forward_native(q).reshape(q_shape) + k_shape = k.shape + k = k.reshape(k_shape[:-1] + (k_shape[-1] // self.head_dim, self.head_dim)) + k = self.k_norm.forward_native(k).reshape(k_shape) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Plamo3DecoderLayer(nn.Module): + def __init__( + self, vllm_config: VllmConfig, prefix: str = "", **kwargs: Any + ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.mixer = Plamo3AttentionMixer( + vllm_config=vllm_config, + prefix=f"{prefix}.mixer", + ) + + self.mlp = DenseMLP( + config=config, quant_config=quant_config, prefix=f"{prefix}.mlp" + ) + self.pre_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.pre_mixer_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + self.post_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.post_mixer_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0 / 5)}, + ) + self.pre_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + 
set_weight_attrs( + self.pre_mlp_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + self.post_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.post_mlp_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0 / (5**1.5))}, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs: Any, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + if residual is None: + residual = hidden_states + hidden_states = self.pre_mixer_norm(hidden_states) + else: + hidden_states, residual = self.pre_mixer_norm(hidden_states, residual) + + hidden_states = self.mixer( + positions=positions, hidden_states=hidden_states, residual=residual + ) + hidden_states = self.post_mixer_norm(hidden_states) + # Fully Connected + hidden_states, residual = self.pre_mlp_norm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_norm(hidden_states) + return hidden_states, residual + + +class Plamo3Decoder(torch.nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers + + self.start_layer, self.end_layer, self.layers = make_layers( + num_hidden_layers, + lambda prefix: Plamo3DecoderLayer(vllm_config, prefix=prefix), + prefix=f"{prefix}.layers", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + for layer in islice(self.layers, self.start_layer, self.end_layer): + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + return hidden_states, residual + + +@support_torch_compile +class Plamo3Model(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + prefix=f"{prefix}.embed_tokens", + ) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + self.layers = Plamo3Decoder(vllm_config, prefix=f"{prefix}.layers") + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + hidden_states, residual = self.layers( + positions=positions, hidden_states=hidden_states, residual=residual + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors( + 
{"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Plamo3ForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + self.config = vllm_config.model_config.hf_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.scheduler_config = vllm_config.scheduler_config + + self.model = Plamo3Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + self.vocab_size = self.config.vocab_size + self.unpadded_vocab_size = self.config.vocab_size + + num_embeddings = ((self.vocab_size + 15) // 16) * 16 + self.lm_head = ParallelLMHead( + num_embeddings, + self.config.hidden_size, + org_num_embeddings=self.config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=f"{prefix}.lm_head", + ) + if self.config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, self.config.vocab_size + ) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a2de597c87d8..494398760620 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -157,6 +157,7 @@ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "Plamo2ForCausalLM": ("plamo2", "Plamo2ForCausalLM"), + "Plamo3ForCausalLM": ("plamo3", "Plamo3ForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), From 371b1d4c61335ed4c1d7fb2acee75274cc6d4551 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Thu, 20 Nov 2025 19:01:03 +0800 Subject: [PATCH 242/578] [RL] Add Pause and Resume Generation for Asynchronous RL Training (#28037) Signed-off-by: SamitHuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> Signed-off-by: samithuang <285365963@qq.com> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/engine/protocol.py | 27 ++++++++++ vllm/entrypoints/openai/api_server.py | 78 +++++++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 64 ++++++++++++++++++++++ vllm/v1/engine/output_processor.py | 13 +++++ 4 files changed, 182 insertions(+) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 462d2c4e50e7..5e3374f9f6a1 
100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -149,6 +149,33 @@ async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" ... + @abstractmethod + async def pause_generation( + self, + *, + wait_for_inflight_requests: bool = False, + clear_cache: bool = True, + ) -> None: + """Pause new generation/encoding requests. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight requests + to finish before pausing. When ``False`` (default), aborts in-flight + requests immediately. + clear_cache: Whether to clear KV and prefix caches after draining. + """ + ... + + @abstractmethod + async def resume_generation(self) -> None: + """Resume accepting generation/encoding requests.""" + ... + + @abstractmethod + async def is_paused(self) -> bool: + """Return whether the engine is currently paused.""" + ... + async def scale_elastic_ep( self, new_data_parallel_size: int, drain_timeout: int = 300 ) -> None: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3974f45a7135..70174250ceab 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -394,6 +394,84 @@ async def get_server_load_metrics(request: Request): return JSONResponse(content={"server_load": request.app.state.server_load_metrics}) +@router.post("/pause") +async def pause_generation( + raw_request: Request, + wait_for_inflight_requests: bool = Query(False), + clear_cache: bool = Query(True), +) -> JSONResponse: + """Pause generation requests to allow weight updates. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + aborts any in-flight requests immediately. + clear_cache: Whether to clear KV/prefix caches after draining. 
+ """ + + engine = engine_client(raw_request) + + try: + await engine.pause_generation( + wait_for_inflight_requests=wait_for_inflight_requests, + clear_cache=clear_cache, + ) + return JSONResponse( + content={"status": "paused"}, + status_code=HTTPStatus.OK.value, + ) + + except ValueError as err: + return JSONResponse( + content={"error": str(err)}, + status_code=HTTPStatus.BAD_REQUEST.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to pause generation") + return JSONResponse( + content={"error": f"Failed to pause generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.post("/resume") +async def resume_generation(raw_request: Request) -> JSONResponse: + """Resume generation after a pause.""" + + engine = engine_client(raw_request) + + try: + await engine.resume_generation() + return JSONResponse( + content={"status": "resumed"}, + status_code=HTTPStatus.OK.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to resume generation") + return JSONResponse( + content={"error": f"Failed to resume generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.get("/is_paused") +async def is_paused(raw_request: Request) -> JSONResponse: + """Return the current pause status.""" + + engine = engine_client(raw_request) + + try: + paused = await engine.is_paused() + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to fetch pause status") + return JSONResponse( + content={"error": f"Failed to fetch pause status: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + return JSONResponse(content={"is_paused": paused}) + + @router.post( "/tokenize", dependencies=[Depends(validate_json_request)], diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index abf2c8cfa453..c64b3cccfc65 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,6 +152,10 @@ def __init__( ) self.logger_manager.log_engine_initialized() + # Pause / resume state for async RL workflows. + self._pause_cond = asyncio.Condition() + self._paused = False + self.output_handler: asyncio.Task | None = None try: # Start output handler eagerly if we are in the asyncio eventloop. @@ -404,6 +408,10 @@ async def generate( # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() + # Wait until generation is resumed if the engine is paused. + async with self._pause_cond: + await self._pause_cond.wait_for(lambda: not self._paused) + if tokenization_kwargs is None: tokenization_kwargs = {} truncate_prompt_tokens = sampling_params.truncate_prompt_tokens @@ -551,6 +559,58 @@ async def abort(self, request_id: str | Iterable[str]) -> None: if self.log_requests: logger.info("Aborted request(s) %s.", ",".join(request_ids)) + async def pause_generation( + self, + *, + wait_for_inflight_requests: bool = False, + clear_cache: bool = True, + ) -> None: + """ + Pause generation to allow model weight updates. + + New generation/encoding requests are blocked until resume. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + immediately aborts any in-flight requests. + clear_cache: Whether to clear KV cache and prefix cache after + draining. Set to ``False`` to preserve cache for faster resume. + Default is ``True`` (clear caches). 
+ """ + + async with self._pause_cond: + if self._paused: + return + self._paused = True + + if not wait_for_inflight_requests: + request_ids = list(self.output_processor.request_states.keys()) + if request_ids: + await self.abort(request_ids) + + # Wait for running requests to drain before clearing cache. + if self.output_processor.has_unfinished_requests(): + await self.output_processor.wait_for_requests_to_drain() + + # Clear cache + if clear_cache: + await self.reset_prefix_cache() + await self.reset_mm_cache() + + async def resume_generation(self) -> None: + """Resume generation after :meth:`pause_generation`.""" + + async with self._pause_cond: + self._paused = False + self._pause_cond.notify_all() # Wake up all waiting requests + + async def is_paused(self) -> bool: + """Return whether the engine is currently paused.""" + + async with self._pause_cond: + return self._paused + async def encode( self, prompt: PromptType, @@ -582,6 +642,10 @@ async def encode( # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() + # Respect pause state before accepting new requests. + async with self._pause_cond: + await self._pause_cond.wait_for(lambda: not self._paused) + if tokenization_kwargs is None: tokenization_kwargs = {} _validate_truncation_size( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index bdbbfe2595f8..0453c4a77f0c 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -350,6 +350,8 @@ def __init__( self.parent_requests: dict[str, ParentRequest] = {} self.lora_states = LoRARequestStates(log_stats) self.tracer: Tracer | None = None + self._requests_drained = asyncio.Event() + self._requests_drained.set() def get_num_unfinished_requests(self): return len(self.request_states) @@ -357,6 +359,11 @@ def get_num_unfinished_requests(self): def has_unfinished_requests(self) -> bool: return len(self.request_states) > 0 + async def wait_for_requests_to_drain(self) -> None: + if not self.request_states: + return + await self._requests_drained.wait() + def propagate_error(self, e: Exception): """Propagate error to all generate() tasks.""" @@ -396,6 +403,8 @@ def abort_requests( child_reqs = self.abort_requests(child_reqs) request_ids_to_abort.extend(child_reqs) self.parent_requests.pop(request_id, None) + if not self.request_states: + self._requests_drained.set() return request_ids_to_abort def add_request( @@ -420,6 +429,8 @@ def add_request( log_stats=self.log_stats, stream_interval=self.stream_interval, ) + if self._requests_drained.is_set(): + self._requests_drained.clear() self.request_states[request_id] = req_state if parent_req: self.parent_requests[parent_req.request_id] = parent_req @@ -511,6 +522,8 @@ def process_outputs( parent_req = req_state.parent_req if parent_req and not parent_req.child_requests: self.parent_requests.pop(parent_req.request_id, None) + if not self.request_states: + self._requests_drained.set() if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. 
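Once a server built with this patch is running, the new pause/resume endpoints can be exercised directly over HTTP. The snippet below is an illustrative sketch rather than part of the patch; it assumes the OpenAI-compatible server is reachable at the default http://localhost:8000, and the actual weight update is left as a commented placeholder:

import requests

base = "http://localhost:8000"  # assumed server address

# Pause: drain in-flight requests instead of aborting them, and keep KV/prefix caches warm.
resp = requests.post(
    f"{base}/pause",
    params={"wait_for_inflight_requests": "true", "clear_cache": "false"},
)
assert resp.json() == {"status": "paused"}

assert requests.get(f"{base}/is_paused").json()["is_paused"] is True

# ... push updated model weights to the paused engine here ...

# Resume: blocked generate()/encode() calls are woken via the asyncio.Condition.
assert requests.post(f"{base}/resume").json() == {"status": "resumed"}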
From 93c8672ceb06f6e9c282a96fcd85a7ce41293693 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Thu, 20 Nov 2025 03:05:50 -0800 Subject: [PATCH 243/578] [Bugfix] Fix spec decode memory regression after #28549 (#28819) Signed-off-by: zhewenli --- vllm/model_executor/models/deepseek_eagle.py | 5 ----- vllm/model_executor/models/llama4_eagle.py | 7 ------- vllm/model_executor/models/llama_eagle.py | 5 ----- vllm/v1/spec_decode/eagle.py | 7 +++++-- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 3fb04c3b70dd..4d7a37292cb0 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -8,7 +8,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -172,10 +171,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue - # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index 660c8f1bb522..0146b3057928 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -23,7 +23,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -127,17 +126,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: weight_loader(param, loaded_weight, shard_id) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) for name in params_dict: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue assert name in loaded_params, f"{name} is not loaded!" 
return loaded_params diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 90ab5c50361b..05cb456e7776 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -9,7 +9,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -155,10 +154,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: weight_loader(param, loaded_weight, shard_id) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 406bb696bd4c..ba37bc81607f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1028,8 +1028,11 @@ def load_model(self, target_model: nn.Module) -> None: elif ( isinstance(target_embed_tokens.weight, torch.Tensor) and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) - and torch.equal( - target_embed_tokens.weight, self.model.model.embed_tokens.weight + and torch.allclose( + target_embed_tokens.weight.cpu(), + self.model.model.embed_tokens.weight.cpu(), + rtol=1e-5, + atol=1e-7, ) ): share_embeddings = True From a2e9ebe9e242295a58e400835ef98a14b29c4fb0 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Thu, 20 Nov 2025 20:14:29 +0800 Subject: [PATCH 244/578] [BugFix] Fix flash_attn import in `siglip2navit.py` (#29082) Signed-off-by: Fanli Lin --- vllm/model_executor/models/siglip2navit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 46f5e67d659e..c185b45345bd 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -191,7 +191,7 @@ def apply_rotary_pos_emb( cos = cos.chunk(2, dim=-1)[0].contiguous() sin = sin.chunk(2, dim=-1)[0].contiguous() if is_flash_attn_backend and not current_platform.is_xpu(): - from flash_attn.layers.rotary import apply_rotary_emb + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb apply_rotary_emb_func = apply_rotary_emb else: From 82b05b15e61badfd0c5912d4c3eebc88043c9ef8 Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 20 Nov 2025 23:34:11 +0700 Subject: [PATCH 245/578] [BugFix] [FEAT] Enable fastsafetensors for ROCm platform (#28225) Signed-off-by: tjtanaa --- requirements/rocm.txt | 1 + .../fastsafetensors_loader/test_fastsafetensors_loader.py | 3 ++- .../model_loader/fastsafetensors_loader/test_weight_utils.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 6f1cca90e5e2..abbd33d6e124 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -15,3 +15,4 @@ setuptools-scm>=8 runai-model-streamer[s3,gcs]==0.15.0 conch-triton-kernels==1.2.1 timm>=1.0.17 +fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 diff --git a/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py 
b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py index f154df6dfc23..c5b3c731ffc6 100644 --- a/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py +++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py @@ -19,7 +19,8 @@ @pytest.mark.skipif( - not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs" + not current_platform.is_cuda_alike(), + reason="fastsafetensors requires NVIDIA/AMD GPUs", ) def test_model_loader_download_files(vllm_runner): with vllm_runner(test_model, load_format="fastsafetensors") as llm: diff --git a/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py index bd216f0e41a4..1975eb61b25d 100644 --- a/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py +++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py @@ -17,7 +17,8 @@ @pytest.mark.skipif( - not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs" + not current_platform.is_cuda_alike(), + reason="fastsafetensors requires NVIDIA/AMD GPUs", ) def test_fastsafetensors_model_loader(): with tempfile.TemporaryDirectory() as tmpdir: From 56f45eddaff817ec7118bf9a73c5e4b560738bed Mon Sep 17 00:00:00 2001 From: rookie <66160395+zhanggzh@users.noreply.github.com> Date: Fri, 21 Nov 2025 01:02:30 +0800 Subject: [PATCH 246/578] [Frontend] Optimize beam search loop by sorting and then splicing (#19347) Signed-off-by: zhangguozhu Signed-off-by: mgoin Co-authored-by: zhangguozhu Co-authored-by: mgoin --- vllm/entrypoints/openai/serving_engine.py | 103 +++++++++++++++------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c50b0c4a23e1..127b8e6dcb87 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -10,6 +10,7 @@ from http import HTTPStatus from typing import Any, ClassVar, Generic, TypeAlias, TypeVar +import numpy as np import torch from fastapi import Request from pydantic import BaseModel, ConfigDict, Field, TypeAdapter @@ -389,8 +390,9 @@ async def beam_search( sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty) + logprobs_num = 2 * beam_width beam_search_params = SamplingParams( - logprobs=2 * beam_width, + logprobs=logprobs_num, max_tokens=1, temperature=temperature, ) @@ -443,40 +445,75 @@ async def beam_search( output = [x[0] for x in await asyncio.gather(*tasks)] new_beams = [] - for i, current_beam in enumerate(all_beams): - result = output[i] - + # Store all new tokens generated by beam + all_beams_token_id = [] + # Store the cumulative probability of all tokens + # generated by beam search + all_beams_logprob = [] + # Iterate through all beam inference results + for i, result in enumerate(output): + current_beam = all_beams[i] if result.outputs[0].logprobs is not None: logprobs = result.outputs[0].logprobs[0] - for token_id, logprob_obj in logprobs.items(): - if token_id == eos_token_id and not ignore_eos: - completed.append( - BeamSearchSequence( - tokens=current_beam.tokens + [token_id] - if include_stop_str_in_output - else current_beam.tokens, - logprobs=current_beam.logprobs + [logprobs], - cum_logprob=current_beam.cum_logprob - + logprob_obj.logprob, - finish_reason="stop", - stop_reason=eos_token_id, - ) - ) - 
else: - new_beams.append( - BeamSearchSequence( - tokens=current_beam.tokens + [token_id], - logprobs=current_beam.logprobs + [logprobs], - lora_request=current_beam.lora_request, - cum_logprob=current_beam.cum_logprob - + logprob_obj.logprob, - multi_modal_data=current_beam.multi_modal_data, - mm_processor_kwargs=current_beam.mm_processor_kwargs, - ) - ) - - sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True) - all_beams = sorted_beams[:beam_width] + all_beams_token_id.extend(list(logprobs.keys())) + all_beams_logprob.extend( + [ + current_beam.cum_logprob + obj.logprob + for obj in logprobs.values() + ] + ) + + # Handle the token for the end of sentence (EOS) + all_beams_token_id = np.array(all_beams_token_id) + all_beams_logprob = np.array(all_beams_logprob) + + if not ignore_eos: + # Get the index position of eos token in all generated results + eos_idx = np.where(all_beams_token_id == eos_token_id)[0] + for idx in eos_idx: + current_beam = all_beams[idx // logprobs_num] + result = output[idx // logprobs_num] + assert result.outputs[0].logprobs is not None + logprobs_entry = result.outputs[0].logprobs[0] + completed.append( + BeamSearchSequence( + tokens=current_beam.tokens + [eos_token_id] + if include_stop_str_in_output + else current_beam.tokens, + logprobs=current_beam.logprobs + [logprobs_entry], + cum_logprob=float(all_beams_logprob[idx]), + finish_reason="stop", + stop_reason=eos_token_id, + ) + ) + # After processing, set the log probability of the eos condition + # to negative infinity. + all_beams_logprob[eos_idx] = -np.inf + + # Processing non-EOS tokens + # Get indices of the top beam_width probabilities + topn_idx = np.argpartition(np.negative(all_beams_logprob), beam_width)[ + :beam_width + ] + + for idx in topn_idx: + current_beam = all_beams[idx // logprobs_num] + result = output[idx // logprobs_num] + token_id = int(all_beams_token_id[idx]) + assert result.outputs[0].logprobs is not None + logprobs_entry = result.outputs[0].logprobs[0] + new_beams.append( + BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs_entry], + lora_request=current_beam.lora_request, + cum_logprob=float(all_beams_logprob[idx]), + multi_modal_data=current_beam.multi_modal_data, + mm_processor_kwargs=current_beam.mm_processor_kwargs, + ) + ) + + all_beams = new_beams completed.extend(all_beams) sorted_completed = sorted(completed, key=sort_beams_key, reverse=True) From 22924383e14a7a37ee86cf6e15f39e13efc86f7c Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:07:06 -0600 Subject: [PATCH 247/578] Updating the mirror of test-amd.yaml as of 2025-11-18 (#29016) Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-amd.yaml | 41 ++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37c6bd427672..4e2ff5c5a6bd 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -61,7 +61,7 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -73,6 +73,7 @@ steps: - tests/multimodal - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py @@ -80,6 +81,7 @@ steps: - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -390,6 +392,15 @@ steps: commands: - pytest -v -s v1/attention +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -529,7 +540,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -694,7 +705,7 @@ steps: - vllm/model_executor/models/whisper.py commands: # LMEval # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + - pytest -s entrypoints/openai/correctness/ - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 @@ -995,12 +1006,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # 
Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -1045,7 +1056,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -1066,7 +1077,9 @@ steps: - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1088,15 +1101,13 @@ steps: commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/test_fusions_e2e.py - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" agent_pool: mi325_1 - mirror_hardwares: [amdproduction] + mirror_hardwares: [amdexperimental, amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1416,7 +1427,9 @@ steps: - pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py From e5bfcb6a88cda4f91e3c7074d7e76bb5d1d36362 Mon Sep 17 00:00:00 2001 From: Pan Li <1162953505@qq.com> Date: Fri, 21 Nov 2025 01:38:31 +0800 Subject: [PATCH 248/578] [BugFix][PD]: make example proxy usable with P2pNcclConnector (#26628) Signed-off-by: PAN <1162953505@qq.com> --- .../disagg_prefill_proxy_server.py | 249 +++++++++++------- .../online_serving/disaggregated_prefill.sh | 19 +- 2 files changed, 169 insertions(+), 99 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 904f80534914..d072c03c440b 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ 
b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -5,11 +5,12 @@ import asyncio import logging import os +import time +import uuid +from urllib.parse import urlparse import aiohttp from quart import Quart, Response, make_response, request -from rate_limiter import RateLimiter -from request_queue import RequestQueue # Configure logging logging.basicConfig(level=logging.INFO) @@ -24,26 +25,8 @@ def parse_args(): parser.add_argument( "--timeout", type=float, - default=300, - help="Timeout for backend service requests in seconds (default: 300)", - ) - parser.add_argument( - "--max-concurrent", - type=int, - default=100, - help="Maximum concurrent requests to backend services (default: 100)", - ) - parser.add_argument( - "--queue-size", - type=int, - default=500, - help="Maximum number of requests in the queue (default: 500)", - ) - parser.add_argument( - "--rate-limit", - type=int, - default=40, - help="Maximum requests per second (default: 40)", + default=6 * 60 * 60, + help="Timeout for backend service requests in seconds (default: 21600)", ) parser.add_argument( "--port", @@ -54,14 +37,32 @@ def parse_args(): parser.add_argument( "--prefill-url", type=str, - default="http://localhost:8100/v1/completions", - help="Prefill service endpoint URL", + default="http://localhost:8100", + help="Prefill service base URL (protocol + host[:port])", ) parser.add_argument( "--decode-url", type=str, - default="http://localhost:8200/v1/completions", - help="Decode service endpoint URL", + default="http://localhost:8200", + help="Decode service base URL (protocol + host[:port])", + ) + parser.add_argument( + "--kv-host", + type=str, + default="localhost", + help="Hostname or IP used by KV transfer (default: localhost)", + ) + parser.add_argument( + "--prefill-kv-port", + type=int, + default=14579, + help="Prefill KV port (default: 14579)", + ) + parser.add_argument( + "--decode-kv-port", + type=int, + default=14580, + help="Decode KV port (default: 14580)", ) return parser.parse_args() @@ -73,70 +74,129 @@ def main(): # Initialize configuration using command line parameters AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) - MAX_CONCURRENT_REQUESTS = args.max_concurrent - REQUEST_QUEUE_SIZE = args.queue_size - RATE_LIMIT = args.rate_limit PREFILL_SERVICE_URL = args.prefill_url DECODE_SERVICE_URL = args.decode_url PORT = args.port - app = Quart(__name__) + PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}" + DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}" - # Initialize the rate limiter and request queue - rate_limiter = RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) + logger.info( + "Proxy resolved KV addresses -> prefill: %s, decode: %s", + PREFILL_KV_ADDR, + DECODE_KV_ADDR, + ) + + app = Quart(__name__) - # Attach the configuration object to the application instance + # Attach the configuration object to the application instance so helper + # coroutines can read the resolved backend URLs and timeouts without using + # globals. 
app.config.update( { "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + "PREFILL_KV_ADDR": PREFILL_KV_ADDR, + "DECODE_KV_ADDR": DECODE_KV_ADDR, } ) - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) - - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' + def _normalize_base_url(url: str) -> str: + """Remove any trailing slash so path joins behave predictably.""" + return url.rstrip("/") + + def _get_host_port(url: str) -> str: + """Return the hostname:port portion for logging and KV headers.""" + parsed = urlparse(url) + host = parsed.hostname or "localhost" + port = parsed.port + if port is None: + port = 80 if parsed.scheme == "http" else 443 + return f"{host}:{port}" + + PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) + DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) + KV_TARGET = _get_host_port(DECODE_SERVICE_URL) + + def _build_headers(request_id: str) -> dict[str, str]: + """Construct the headers expected by vLLM's P2P disagg connector.""" + headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + async def _run_prefill( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{PREFILL_BASE}{request_path}" + start_ts = time.perf_counter() + logger.info("[prefill] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + raise RuntimeError( + f"Prefill backend error {resp.status}: {error_text}" + ) + await resp.read() + logger.info( + "[prefill] done request_id=%s status=%s elapsed=%.2fs", + request_id, + resp.status, + time.perf_counter() - start_ts, + ) + except asyncio.TimeoutError as exc: + raise RuntimeError(f"Prefill service timeout at {url}") from exc + except aiohttp.ClientError as exc: + raise RuntimeError(f"Prefill service unavailable at {url}") from exc + + async def _stream_decode( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url 
= f"{DECODE_BASE}{request_path}" + # Stream tokens from the decode service once the prefill stage has + # materialized KV caches on the target workers. + logger.info("[decode] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + logger.error( + "Decode backend error %s - %s", resp.status, error_text + ) + err_msg = ( + '{"error": "Decode backend error ' + str(resp.status) + '"}' + ) + yield err_msg.encode() + return + logger.info( + "[decode] streaming response request_id=%s status=%s", + request_id, + resp.status, + ) + async for chunk_bytes in resp.content.iter_chunked(1024): + yield chunk_bytes + logger.info("[decode] finished streaming request_id=%s", request_id) + except asyncio.TimeoutError: + logger.error("Decode service timeout at %s", url) + yield b'{"error": "Decode service timeout"}' + except aiohttp.ClientError as exc: + logger.error("Decode service error at %s: %s", url, exc) + yield b'{"error": "Decode service unavailable"}' async def process_request(): """Process a single request through prefill and decode stages""" @@ -146,13 +206,27 @@ async def process_request(): # Create prefill request (max_tokens=1) prefill_request = original_request_data.copy() prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue + # The request id encodes both KV socket addresses so the backend can + # shuttle tensors directly via NCCL once the prefill response + # completes. + request_id = ( + f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_" + f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}" + ) + + headers = _build_headers(request_id) + await _run_prefill(request.path, prefill_request, headers, request_id) # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) + # Pass the unmodified user request so the decode phase can continue + # sampling with the already-populated KV cache. 
+ generator = _stream_decode( + request.path, original_request_data, headers, request_id + ) response = await make_response(generator) response.timeout = None # Disable timeout for streaming response return response @@ -168,23 +242,10 @@ async def process_request(): @app.route("/v1/completions", methods=["POST"]) async def handle_request(): """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - try: - # Return the response from the processing task - return await task + return await process_request() except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") + logger.warning("Request cancelled") return Response( response=b'{"error": "Request cancelled"}', status=503, diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index d434e22b1ae8..cd2f2e44a4d6 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -24,7 +24,14 @@ cleanup() { exit 0 } -export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +if [[ -z "${VLLM_HOST_IP:-}" ]]; then + export VLLM_HOST_IP=127.0.0.1 + echo "Using default VLLM_HOST_IP=127.0.0.1 (override by exporting VLLM_HOST_IP before running this script)" +else + echo "Using provided VLLM_HOST_IP=${VLLM_HOST_IP}" +fi + # install quart first -- required for disagg prefill proxy serve if python3 -c "import quart" &> /dev/null; then @@ -38,7 +45,7 @@ fi wait_for_server() { local port=$1 timeout 1200 bash -c " - until curl -s localhost:${port}/v1/completions > /dev/null; do + until curl -i localhost:${port}/v1/models > /dev/null; do sleep 1 done" && return 0 || return 1 } @@ -48,21 +55,23 @@ wait_for_server() { # prefilling instance, which is the KV producer CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ --port 8100 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' & -# decoding instance, which is the KV consumer +# decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ --port 8200 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":"1e10","kv_port":"14580","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8200","send_type":"PUT_ASYNC"}}' & # wait until prefill and decode instances are ready wait_for_server 8100 From 647464719b131963dccdc3a28cfe52d1af293cda Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 
20 Nov 2025 20:09:59 +0200 Subject: [PATCH 249/578] [KVConnector][Core] Support cross-layer KV blocks (#27743) Signed-off-by: Or Ozeri --- .../unit/test_offloading_connector.py | 8 +- tests/v1/kv_offload/test_cpu_offloading.py | 145 +++++++++------ tests/v1/worker/test_gpu_model_runner.py | 5 +- vllm/attention/backends/abstract.py | 29 ++- .../kv_transfer/kv_connector/v1/base.py | 33 +++- .../kv_connector/v1/offloading_connector.py | 43 ++++- vllm/v1/attention/backends/flash_attn.py | 12 +- vllm/v1/attention/backends/flashinfer.py | 12 +- vllm/v1/attention/backends/mla/common.py | 9 + vllm/v1/attention/backends/mla/indexer.py | 6 +- vllm/v1/kv_offload/cpu.py | 17 +- vllm/v1/kv_offload/spec.py | 6 +- vllm/v1/kv_offload/worker/cpu_gpu.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 41 ++++- .../worker/kv_connector_model_runner_mixin.py | 165 ++++++++++++++++++ 15 files changed, 453 insertions(+), 90 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py index 23b6c4802d10..69565f584ab8 100644 --- a/tests/v1/kv_connector/unit/test_offloading_connector.py +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -19,6 +19,7 @@ ) from vllm.forward_context import ForwardContext from vllm.utils.hashing import sha256 +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.core.kv_cache_utils import ( BlockHash, get_request_block_hasher, @@ -92,7 +93,7 @@ def get_manager(self) -> OffloadingManager: return self.manager def get_handlers( - self, _ + self, _, __ ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler @@ -138,7 +139,10 @@ def __init__( self.worker_connector = OffloadingConnector(vllm_config, KVConnectorRole.WORKER) # register worker kv_caches to enable OffloadingWorker creations - self.worker_connector.register_kv_caches(kv_caches={"a": torch.empty(0)}) + self.worker_connector.register_cross_layers_kv_cache( + kv_cache=torch.empty(0), + attn_backend=FlashAttentionBackend, + ) # extract connector of scheduler scheduler_connector = self.scheduler.connector diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index b654ea4298db..3ee41c40859d 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -12,8 +12,10 @@ from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.utils.system_utils import set_env_var -CPU_BLOCK_SIZES = [16, 48] +CPU_BLOCK_SIZES = [48] +ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] class MockSubscriber: @@ -63,8 +65,88 @@ def close(self): self.sub.close() +def _latency_test(llm: LLM, subscriber: MockSubscriber): + sampling_params = SamplingParams(max_tokens=1) + + num_times_cpu_better_than_cold = 0 + num_tests = 10 + total_cold_time = 0.0 + total_gpu_hit_time = 0.0 + total_cpu_hit_time = 0.0 + prompt_token_ids = [0] * 10001 + for i in tqdm(range(num_tests), desc="Running tests"): + prompt_token_ids[0] = i + prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] + + # run generation - this should trigger saving KV cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cold_time = time.time() - start_time + total_cold_time += cold_time + + # run 
generation again - should hit the GPU prefix cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + gpu_hit_time = time.time() - start_time + total_gpu_hit_time += gpu_hit_time + + # reset prefix cache to avoid GPU hit. + llm.reset_prefix_cache() + + assert subscriber.get_new_cpu_stored_events() + + # run generation again - this should trigger loading from CPU + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cpu_hit_time = time.time() - start_time + total_cpu_hit_time += cpu_hit_time + + if cpu_hit_time < cold_time: + num_times_cpu_better_than_cold += 1 + + print("Average times:") + print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") + print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") + print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") + + assert num_times_cpu_better_than_cold >= 0.8 * num_tests + + +def _accuracy_test(llm: LLM, subscriber: MockSubscriber): + sampling_params = SamplingParams(max_tokens=1) + cpu_block_size = ( + llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[ + "block_size" + ] + ) + + subscriber.get_new_cpu_stored_events() + + # prepend prompt to be cpu block aligned + prompt = "Let's count to 10. One, two, three, four," + while ( + len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size + != 0 + ): + prompt = ". " + prompt + + assert subscriber.get_new_cpu_stored_events() + + test_count = 100 + success_count = 0 + for i in range(test_count): + if ( + llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text + == " five" + ): + success_count += 1 + + assert success_count >= 0.5 * test_count + + @pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) -def test_cpu_offloading(cpu_block_size: int) -> None: +@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS) +def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None: """ Tests OffloadingConnector with CPUOffloadingSpec. """ @@ -92,61 +174,20 @@ def test_cpu_offloading(cpu_block_size: int) -> None: topic="test", ) - llm = LLM( - model="meta-llama/Llama-3.2-1B-Instruct", - gpu_memory_utilization=0.5, - kv_events_config=kv_events_config, - kv_transfer_config=kv_transfer_config, - ) - - sampling_params = SamplingParams(temperature=0, max_tokens=1) + with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend): + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_events_config=kv_events_config, + kv_transfer_config=kv_transfer_config, + ) events_endpoint = events_endpoint.replace("*", "127.0.0.1") subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic) try: - num_times_cpu_better_than_cold = 0 - num_tests = 10 - total_cold_time = 0.0 - total_gpu_hit_time = 0.0 - total_cpu_hit_time = 0.0 - prompt_token_ids = [0] * 10001 - for i in tqdm(range(num_tests), desc="Running tests"): - prompt_token_ids[0] = i - prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] - - # run generation - this should trigger saving KV cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cold_time = time.time() - start_time - total_cold_time += cold_time - - # run generation again - should hit the GPU prefix cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - gpu_hit_time = time.time() - start_time - total_gpu_hit_time += gpu_hit_time - - # reset prefix cache to avoid GPU hit. 
- llm.reset_prefix_cache() - - assert subscriber.get_new_cpu_stored_events() - - # run generation again - this should trigger loading from CPU - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cpu_hit_time = time.time() - start_time - total_cpu_hit_time += cpu_hit_time - - if cpu_hit_time < cold_time: - num_times_cpu_better_than_cold += 1 - - print("Average times:") - print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") - print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") - print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") - - assert num_times_cpu_better_than_cold >= 0.8 * num_tests + _latency_test(llm, subscriber) + _accuracy_test(llm, subscriber) finally: subscriber.close() del llm diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 824e45897835..01c1364f7ee6 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -483,7 +483,10 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): # Permutation that gets you back to expected kv shape for test_stride in ((1, 4, 0, 2, 3), (0, 1, 2, 3, 4)): - def rnd_stride_order(test_stride=test_stride): + def rnd_stride_order( + include_num_layers_dimension: bool = False, test_stride=test_stride + ): + assert not include_num_layers_dimension return test_stride # Patch the attention backend class and re-trigger the KV cache creation diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 188becb6ad6f..67ded8847524 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -76,7 +76,34 @@ def get_kv_cache_shape( raise NotImplementedError @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + """ + Get the physical (memory layout) ordering of the kv cache dimensions. + e.g. if the KV cache shape is + [2, num_blocks, block_size, num_heads, head_size], + and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical + ordering of dimensions is + [num_blocks, num_heads, 2, block_size, head_size]. + + If this function is unimplemented / raises NotImplementedError, + the physical layout of the KV cache will match the logical shape. + + Args: + include_num_layers_dimension: if True, includes an additional + num_layers dimension, which is assumed to be prepended + to the logical KV cache shape. + With the above example, a return value (2, 4, 0, 1, 3, 5) + corresponds to + [num_blocks, num_heads, num_layers, 2, block_size, head_size]. + + If an additional dimension is NOT included in the returned + tuple, the physical layout will not include a layers dimension. + + Returns: + A tuple of ints which is a permutation of range(len(shape)). 
+ """ raise NotImplementedError @classmethod diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index f85eb414b222..74f09278b7bb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -38,7 +38,7 @@ import enum from abc import ABC, abstractmethod from collections.abc import Callable, Iterable -from typing import TYPE_CHECKING, Any, Literal, Optional +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional import torch @@ -47,7 +47,7 @@ from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata + from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -142,6 +142,18 @@ class KVConnectorMetadata(ABC): # noqa: B024 class KVConnectorBase_V1(ABC): + """ + Base class for KV connectors. + + Attributes: + prefer_cross_layer_blocks (bool): Indicates whether this connector + prefers KV blocks that hold KV data for all layers (for speeding + up KV data transfers). + Defaults to False. + """ + + prefer_cross_layer_blocks: ClassVar[bool] = False + def __init__( self, vllm_config: "VllmConfig", @@ -226,6 +238,23 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): """ return + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"] + ): + """ + Initialize with a single KV cache tensor used by all layers. + The first dimension should be num_layers. + This function will only be called for models with uniform layers, + and only if the prefers_cross_layer_blocks is set to True. + Only one of the functions + {register_kv_caches, register_cross_layers_kv_cache} will be called. + + Args: + kv_cache: a cross-layers kv cache tensor + attn_backend: The attention backend that corresponds to all layers + """ + return + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): """ Set the xPU-specific ops for copying KV between host and device. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 582e42cc466a..8cd09014cab1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -4,12 +4,12 @@ from collections.abc import Iterable, Iterator from dataclasses import dataclass from itertools import islice -from typing import Any +from typing import Any, ClassVar import torch -from vllm.attention import AttentionMetadata -from vllm.config import VllmConfig +from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, @@ -42,6 +42,8 @@ class OffloadingConnectorMetadata(KVConnectorMetadata): class OffloadingConnector(KVConnectorBase_V1): + prefer_cross_layer_blocks: ClassVar[bool] = True + def __init__( self, vllm_config: VllmConfig, @@ -63,6 +65,12 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): assert self.connector_worker is not None self.connector_worker.register_kv_caches(kv_caches) + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + ): + assert self.connector_worker is not None + self.connector_worker.register_cross_layers_kv_cache(kv_cache, attn_backend) + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: assert self.connector_worker is not None assert isinstance(self._connector_metadata, OffloadingConnectorMetadata) @@ -422,10 +430,35 @@ def _generate_job_id(self) -> int: self._job_counter = job_id + 1 return job_id - def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): - for src_cls, dst_cls, handler in self.spec.get_handlers(kv_caches): + def _register_handlers( + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]], + ): + for src_cls, dst_cls, handler in self.spec.get_handlers( + kv_caches, attn_backends + ): self.worker.register_handler(src_cls, dst_cls, handler) + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + layer_names = list(kv_caches.keys()) + layers = get_layers_from_vllm_config( + self.spec.vllm_config, Attention, layer_names + ) + attn_backends = { + layer_name: layers[layer_name].get_attn_backend() + for layer_name in layer_names + } + self._register_handlers(kv_caches, attn_backends) + + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + ): + cross_layer_name = "ALL_LAYERS" + kv_caches = {cross_layer_name: kv_cache} + attn_backends = {cross_layer_name: attn_backend} + self._register_handlers(kv_caches, attn_backends) + def start_load_kv(self, metadata: OffloadingConnectorMetadata): for req_id, transfer_spec in metadata.reqs_to_load.items(): job_id = self._generate_job_id() diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index cf3c1d05f5b3..9fa6b1dfd19d 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -99,12 +99,20 @@ def get_kv_cache_shape( return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = 
False, + ) -> tuple[int, ...]: # `stride_order` indicates the permutation that gets # us from `get_kv_cache_shape` to the actual memory layout we want. cache_layout = get_kv_cache_layout() - if cache_layout == "NHD": + if cache_layout == "NHD" and include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (2, 0, 1, 3, 4, 5) + elif cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND" and include_num_layers_dimension: + # (num_blocks, num_kv_heads, num_layers, 2, block_size, head_size) + return (2, 4, 0, 1, 3, 5) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 4da1637d96eb..3ad7e8c52fc1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -309,12 +309,20 @@ def get_kv_cache_shape( return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: # `stride_order` indicates the permutation that gets us from # `get_kv_cache_shape` to the actual memory layout we want. cache_layout = get_kv_cache_layout() - if cache_layout == "NHD": + if cache_layout == "NHD" and include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (1, 0, 2, 3, 4, 5) + elif cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND" and include_num_layers_dimension: + # (num_blocks, 2, num_kv_heads, num_layers, block_size, head_size) + return (1, 2, 4, 0, 3, 5) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 32f406980f2e..43aef8a7cca9 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -308,6 +308,15 @@ def get_kv_cache_shape( ) -> tuple[int, ...]: return (num_blocks, block_size, head_size) + @staticmethod + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + # `stride_order` indicates the permutation that gets + # us from `get_kv_cache_shape` to the actual memory layout we want. 
+ # (num_blocks, num_layers, block_size, head_size) + return (1, 0, 2, 3) if include_num_layers_dimension else (0, 1, 2) + @classmethod def get_supported_head_sizes(cls) -> list[int]: return [576] diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index cc0988435768..d38361e0fcbf 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -48,7 +48,11 @@ def get_kv_cache_shape( return (num_blocks, block_size, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + if include_num_layers_dimension: + return (0, 1, 2, 3) return (0, 1, 2) diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index 4b1bbe6f0cc2..86747299eb10 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -4,8 +4,8 @@ import torch -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.attention import AttentionBackend +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager @@ -63,7 +63,9 @@ def get_manager(self) -> OffloadingManager: return self._manager def get_handlers( - self, kv_caches: dict[str, torch.Tensor] + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: if not current_platform.is_cuda_alike(): @@ -71,15 +73,6 @@ def get_handlers( "CPU Offloading is currently only supported on CUDA-alike GPUs" ) - layer_names = list(kv_caches.keys()) - layers = get_layers_from_vllm_config( - self.vllm_config, AttentionLayerBase, layer_names - ) - attn_backends = { - layer_name: layers[layer_name].get_attn_backend() - for layer_name in layer_names - } - self._handler = CpuGpuOffloadingHandler( attn_backends=attn_backends, gpu_block_size=self.gpu_block_size, diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index a3c539a47d45..c1813a4ff4ea 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -11,6 +11,7 @@ from vllm.v1.kv_offload.worker.worker import OffloadingHandler if TYPE_CHECKING: + from vllm.attention import AttentionBackend from vllm.config import VllmConfig logger = init_logger(__name__) @@ -48,13 +49,16 @@ def get_manager(self) -> OffloadingManager: @abstractmethod def get_handlers( - self, kv_caches: dict[str, torch.Tensor] + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type["AttentionBackend"]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: """ Get offloading handlers along with their respective src and dst types. Args: kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor. + attn_backends: A dictionary of layer_name -> AttentionBackend. Yields: Tuples of (src_type, dst_type, offloading_handler). 
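To make the `include_num_layers_dimension` contract concrete, here is a small self-contained sketch (not part of the patch; the dimension sizes are arbitrary). It uses the FlashAttention NHD ordering `(2, 0, 1, 3, 4, 5)` added earlier in this series and shows how the physical cross-layer buffer and the per-layer logical views relate through the permutation and its inverse, which is the same round trip `allocate_uniform_kv_caches()` performs later in this patch.

```python
import torch

# Arbitrary sizes for illustration only.
num_layers, num_blocks, block_size, num_kv_heads, head_size = 4, 8, 16, 2, 32

# Logical shape: FlashAttention's (2, num_blocks, block_size, num_kv_heads,
# head_size) with a num_layers dimension prepended.
logical_shape = (num_layers, 2, num_blocks, block_size, num_kv_heads, head_size)

# NHD stride order with the layers dimension included, i.e. physical layout
# (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size).
stride_order = (2, 0, 1, 3, 4, 5)

# Allocate the buffer directly in its physical layout.
physical_shape = tuple(logical_shape[i] for i in stride_order)
cross_layer_cache = torch.zeros(physical_shape)

# The inverse permutation restores the logical view without copying.
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
logical_view = cross_layer_cache.permute(*inv_order)
assert tuple(logical_view.shape) == logical_shape

# Each layer still sees the per-layer shape it expects...
per_layer = {f"layer_{i}": logical_view[i] for i in range(num_layers)}
assert per_layer["layer_0"].shape == (2, num_blocks, block_size, num_kv_heads, head_size)

# ...while in memory, all layers' KV data for a given block forms one
# contiguous slab, which is what enables per-block cross-layer transfers.
assert cross_layer_cache[0].is_contiguous()
```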
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 111046377a5d..bb163f0043fc 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -83,10 +83,18 @@ def __init__( self.gpu_tensors.append(gpu_tensor) gpu_shape = gpu_tensor.shape - test_shape = attn_backends[layer_name].get_kv_cache_shape( + attn_backend = attn_backends[layer_name] + test_shape = attn_backend.get_kv_cache_shape( num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256 ) - if test_shape[0] == 1234: + + if len(gpu_shape) != len(test_shape): + # cross-layers tensor + # shape is (num_blocks, ...) + assert len(gpu_shape) == len(test_shape) + 1 + num_blocks_idx = 0 + self.kv_dim_before_num_blocks.append(False) + elif test_shape[0] == 1234: # shape is (num_blocks, ...) num_blocks_idx = 0 self.kv_dim_before_num_blocks.append(False) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0490ed39c8c7..4b0a08ab57e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -349,6 +349,9 @@ def __init__( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] + # Initialize in initialize_kv_cache_tensors + self.cross_layers_kv_cache: torch.Tensor | None = None + self.cross_layers_attn_backend: type[AttentionBackend] | None = None # indexes: [kv_cache_group_id][attn_group] self.attn_groups: list[list[AttentionGroup]] = [] # self.kv_cache_config: KVCacheConfig @@ -4930,12 +4933,30 @@ def initialize_kv_cache_tensors( Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. """ - # Initialize the memory buffer for KV cache - kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) - # Change the memory buffer to the desired shape - kv_caches = self._reshape_kv_cache_tensors( - kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes - ) + + # Try creating KV caches optimized for kv-connector transfers + cache_dtype = self.cache_config.cache_dtype + if self.use_uniform_kv_cache(self.attn_groups, cache_dtype): + kv_caches, cross_layers_kv_cache, attn_backend = ( + self.allocate_uniform_kv_caches( + kv_cache_config, + self.attn_groups, + cache_dtype, + self.device, + kernel_block_sizes, + ) + ) + self.cross_layers_kv_cache = cross_layers_kv_cache + self.cross_layers_attn_backend = attn_backend + else: + # Fallback to the general case + # Initialize the memory buffer for KV cache + kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) + + # Change the memory buffer to the desired shape + kv_caches = self._reshape_kv_cache_tensors( + kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes + ) # Set up cross-layer KV cache sharing for layer_name, target_layer_name in self.shared_kv_cache_layers.items(): @@ -5017,7 +5038,13 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: if has_kv_transfer_group(): kv_transfer_group = get_kv_transfer_group() - kv_transfer_group.register_kv_caches(kv_caches) + if self.cross_layers_kv_cache is not None: + assert self.cross_layers_attn_backend is not None + kv_transfer_group.register_cross_layers_kv_cache( + self.cross_layers_kv_cache, self.cross_layers_attn_backend + ) + else: + kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) if self.dcp_world_size > 1: diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py 
b/vllm/v1/worker/kv_connector_model_runner_mixin.py index db037a9fccd5..e59361f21372 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -11,7 +11,11 @@ TYPE_CHECKING, # noqa: UP035 ) +import torch + +from vllm.attention import AttentionBackend from vllm.config import VllmConfig +from vllm.config.cache import CacheDType from vllm.distributed.kv_transfer import ( ensure_kv_transfer_shutdown, get_kv_transfer_group, @@ -21,11 +25,13 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger +from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig from vllm.v1.outputs import ( EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, ModelRunnerOutput, ) +from vllm.v1.worker.utils import AttentionGroup if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -142,3 +148,162 @@ def get_kv_connector_stats() -> KVConnectorStats | None: if has_kv_transfer_group(): return get_kv_transfer_group().get_kv_connector_stats() return None + + @staticmethod + def use_uniform_kv_cache( + attn_groups: list[list[AttentionGroup]], + cache_dtype: CacheDType, + ) -> bool: + """ + Determines whether a uniform KV layout should be used. + A uniform layout means all layers KV caches will share the same + underlying tensor, where for a given block number, the respective + KV data for all layers will be contiguous. + This will allow efficient KV transfer of per-block KV data for all + layers at once. + Note this layout will only be applied given 3 conditions: + 1. The KV Cache config contains just a single group where all layers + have the same page size. + 2. A KV connector is configured, and the KV connector instance prefers + to use this layout (prefer_cross_layer_blocks() returns True) + 2. The flash attention backend supports this layout + (get_kv_cache_stride_order(True) includes a placement for a + num_layers dimension) + + Note that the actual placement of the num_layers dimensions + in the unified layers tensors will be determined by the attention + backend. + Thus, the layers KV data may still not be contiguous per block + if the attention backend does not support it. + + Args: + attn_groups: The list of attention groups for this model + cache_dtype: The KV cache dtype + Returns: + True if we should use a uniform KV cache layout. 
+ """ + + if not has_kv_transfer_group(): + return False + if not get_kv_transfer_group().prefer_cross_layer_blocks: + return False + + if len(attn_groups) != 1 or len(attn_groups[0]) != 1: + return False + + attn_group = attn_groups[0][0] + kv_cache_spec = attn_group.kv_cache_spec + if not isinstance(kv_cache_spec, AttentionSpec): + return False + + attn_backend = attn_group.backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + 1234, + kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=cache_dtype, + ) + + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order( + include_num_layers_dimension=True + ) + except (AttributeError, NotImplementedError): + return False + + # check that attention backend include a layers dimension + return len(kv_cache_stride_order) == len(kv_cache_shape) + 1 + + @staticmethod + def allocate_uniform_kv_caches( + kv_cache_config: KVCacheConfig, + attn_groups: list[list[AttentionGroup]], + cache_dtype: CacheDType, + device: torch.device, + kernel_block_sizes: list[int], + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, type[AttentionBackend]]: + """ + Initializes and reshapes KV caches for the simple case where all + layers have the same layout. + + This function assumes use_uniform_kv_cache() returned True. + + Args: + kv_cache_config: The KV cache config + attn_groups: The list of attention groups for this model + cache_dtype: The KV cache dtype + device: The torch device to allocate on. + kernel_block_sizes: The kernel block sizes for each KV cache group. + Returns: + A tuple (kv_caches, cross_layers_kv_cache, attn_backend) where: + kv_caches is a dict mapping between layer names to their + corresponding memory buffer for KV cache. + cross_layers_kv_cache is the cross layers kv cache tensor + attn_backend is the attention backend matching this tensor + """ + attn_group = attn_groups[0][0] + kv_cache_spec = attn_group.kv_cache_spec + assert isinstance(kv_cache_spec, AttentionSpec) + + tensor_sizes = set( + kv_cache_tensor.size for kv_cache_tensor in kv_cache_config.kv_cache_tensors + ) + assert len(tensor_sizes) == 1 + tensor_size = tensor_sizes.pop() + + page_size = kv_cache_spec.page_size_bytes + assert tensor_size % page_size == 0 + num_blocks = tensor_size // page_size + num_layers = len(kv_cache_config.kv_cache_tensors) + total_size = tensor_size * num_layers + + assert len(kernel_block_sizes) == 1 + kernel_block_size = kernel_block_sizes[0] + num_blocks_per_kv_block = kv_cache_spec.block_size // kernel_block_size + kernel_num_blocks = num_blocks * num_blocks_per_kv_block + + attn_backend = attn_group.backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + kernel_num_blocks, + kernel_block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=cache_dtype, + ) + + # prepend a num_layers dimension into the shape + kv_cache_shape = (num_layers,) + kv_cache_shape + + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order( + include_num_layers_dimension=True + ) + assert len(kv_cache_stride_order) == len(kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple(range(len(kv_cache_shape))) + + kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order) + + logger.info("Allocating a cross layer KV cache of shape %s", kv_cache_shape) + + # allocate one contiguous buffer for all layers + cross_layers_kv_cache = ( + torch.zeros(total_size, dtype=torch.int8, device=device) + 
.view(kv_cache_spec.dtype) + .view(kv_cache_shape) + ) + + # Maintain original KV shape view. + inv_order = [ + kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order)) + ] + permuted_kv_cache = cross_layers_kv_cache.permute(*inv_order) + + kv_caches = {} + for i, kv_cache_tensor in enumerate(kv_cache_config.kv_cache_tensors): + tensor = permuted_kv_cache[i] + for layer_name in kv_cache_tensor.shared_by: + kv_caches[layer_name] = tensor + + return kv_caches, cross_layers_kv_cache, attn_backend From 114b0e25004b7e7cf0a23dc65f407471bd5de7e8 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 20 Nov 2025 10:22:40 -0800 Subject: [PATCH 250/578] [chore] Update annotate release scripts (#29077) Signed-off-by: Kevin H. Luu --- .buildkite/scripts/annotate-release.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 56bb5cedaa0a..df805e085080 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -23,8 +23,8 @@ To download the wheel (by version): aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl . \`\`\` To download and upload the image: @@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest rm vllm/vllm-openai:latest +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` -EOF \ No newline at end of file +EOF From 4d01b6428448225807e6605d04e37e29fe729b44 Mon Sep 17 00:00:00 2001 From: Software Developer <7852635+dsuhinin@users.noreply.github.com> Date: Thu, 20 Nov 2025 21:00:33 +0100 Subject: [PATCH 251/578] [Bugfix] - Add Trace Headers to Beam Search Path (#29100) Signed-off-by: dsuhinin --- vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_engine.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 59e1c8d53179..6cc685acd672 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -319,6 +319,7 @@ async def create_chat_completion( request_id=request_id, params=sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ) else: engine_request, 
tokenization_kwargs = await self._process_inputs( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a114b77ebc16..1cfb45ef4036 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -216,6 +216,7 @@ async def create_completion( request_id=request_id, params=sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ) else: engine_request, tokenization_kwargs = await self._process_inputs( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 127b8e6dcb87..7dab5dbacd28 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -343,6 +343,7 @@ async def beam_search( request_id: str, params: BeamSearchParams, lora_request: LoRARequest | None = None, + trace_headers: Mapping[str, str] | None = None, ) -> AsyncGenerator[RequestOutput, None]: beam_width = params.beam_width max_tokens = params.max_tokens @@ -437,6 +438,7 @@ async def beam_search( beam_search_params, request_id_item, lora_request=lora_req, + trace_headers=trace_headers, ) ) ) From 3d84ef9054af190ce68333be3e4d16fe928be754 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 14:39:49 -0600 Subject: [PATCH 252/578] [CI/Build][AMD] Skip if flash_attn_varlen_func not available in test_aiter_flash_attn.py (#29043) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/attention/test_aiter_flash_attn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 1dec46e33f22..8f58c470d217 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -6,6 +6,7 @@ import torch import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 +from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.platforms import current_platform NUM_HEADS = [(4, 4), (8, 2)] @@ -100,6 +101,8 @@ def test_varlen_with_paged_kv( num_blocks: int, q_dtype: torch.dtype | None, ) -> None: + if not is_flash_attn_varlen_func_available(): + pytest.skip("flash_attn_varlen_func required to run this test.") torch.set_default_device("cuda") current_platform.seed_everything(0) num_seqs = len(seq_lens) From 5e5a7eb16f121f05e19c8bdf88247744ab9d1b83 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 14:45:56 -0600 Subject: [PATCH 253/578] [CI/Build] Make test_attention_selector.py run tests on correct platform (#29064) Signed-off-by: Randall Smith Signed-off-by: rasmith Co-authored-by: Randall Smith Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/attention/test_attention_selector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 3b8e939300a2..9be56a33f76c 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -7,6 +7,7 @@ import torch from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform @@ -47,9 +48,11 @@ def clear_cache(): def generate_params(): + 
is_rocm = current_platform.is_rocm() params = [] + device_list = ["cuda", "cpu"] if not is_rocm else ["hip", "cpu"] for use_mla in [True, False]: - for device in ["cuda", "hip", "cpu"]: + for device in device_list: backends = ( DEVICE_MLA_BACKENDS[device] if use_mla From 3fd74189db13c9793325d9a36539d891873d1ae4 Mon Sep 17 00:00:00 2001 From: Driss Guessous <32754868+drisspg@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:21:54 -0800 Subject: [PATCH 254/578] Fixes bench (#29058) Signed-off-by: drisspg --- vllm/compilation/caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 16e34c2711e9..63b7ad7279e3 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -12,6 +12,7 @@ import vllm.envs as envs from vllm.config import VllmConfig, get_current_vllm_config +from vllm.config.utils import hash_factors from vllm.logger import init_logger try: @@ -138,7 +139,7 @@ def compilation_config_hash_factors(vllm_config: VllmConfig) -> list[str]: factors = [] # 0. factors come from the env, for example, The values of # VLLM_PP_LAYER_PARTITION will affect the computation graph. - env_hash = envs.compute_hash() + env_hash = hash_factors(envs.compile_factors()) factors.append(env_hash) # 1. factors come from the vllm_config (it mainly summarizes how the From 8237ab8a2bed14bec5cafbec75033c8e1d54d852 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 15:35:14 -0600 Subject: [PATCH 255/578] [CI/Build] Skip lm-format-enforcer tests in test_struct_output_generate.py for now (#29021) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../llm/test_struct_output_generate.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index a7d769c8542a..316e152e7395 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -47,10 +47,34 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None), + pytest.param( + "mistralai/Ministral-8B-Instruct-2410", + "lm-format-enforcer", + "auto", + None, + marks=pytest.mark.skip( + reason=( + "Flaky: lm-format-enforcer intermittently returns" + "incomplete JSON." + "See https://github.com/noamgat/lm-format-enforcer/issues/169" + ) + ), + ), ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), - ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None), + pytest.param( + "Qwen/Qwen2.5-1.5B-Instruct", + "lm-format-enforcer", + "auto", + None, + marks=pytest.mark.skip( + reason=( + "Flaky: lm-format-enforcer intermittently returns" + "incomplete JSON." + "See https://github.com/noamgat/lm-format-enforcer/issues/169" + ) + ), + ), # FIXME: This tests are flaky on CI thus disabled. 
Tracking in Issue #24402 # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None), # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), From c7a29d2c8d07ce6188d0c4bb19df6fd1d0e9bc74 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 15:44:37 -0600 Subject: [PATCH 256/578] [CI/Build] Remove skip global cleanup in test_struct_output_generate.py (#29022) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 316e152e7395..a00600b87eca 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -121,7 +121,6 @@ def test_guided_decoding_deprecated(): assert sp1.structured_outputs == guided_decoding -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, @@ -626,7 +625,6 @@ def test_structured_output( ) -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ @@ -711,7 +709,6 @@ def test_structured_output_with_reasoning_matrices( jsonschema.validate(instance=output_json, schema=reasoning_schema) -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( unsupported_json_schema: dict[str, Any], @@ -758,7 +755,6 @@ def test_structured_output_auto_mode( assert isinstance(parsed_json, dict) -@pytest.mark.skip_global_cleanup def test_guidance_no_additional_properties(): llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", From dd39f91edb0588e2dd77eb55c758eb1e35907af8 Mon Sep 17 00:00:00 2001 From: Rob Mulla Date: Thu, 20 Nov 2025 19:05:59 -0500 Subject: [PATCH 257/578] [Doc] cleanup TPU documentation and remove outdated examples (#29048) Signed-off-by: Rob Mulla Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 6 +- docs/configuration/tpu.md | 111 ------------------ docs/features/README.md | 37 +++--- docs/features/quantization/README.md | 29 +++-- docs/models/hardware_supported_models/tpu.md | 34 ------ .../offline_inference/profiling_tpu/README.md | 70 ----------- .../profiling_tpu/profiling.py | 110 ----------------- examples/offline_inference/tpu.py | 58 --------- 8 files changed, 40 insertions(+), 415 deletions(-) delete mode 100644 docs/configuration/tpu.md delete mode 100644 docs/models/hardware_supported_models/tpu.md delete mode 100644 examples/offline_inference/profiling_tpu/README.md delete mode 100644 examples/offline_inference/profiling_tpu/profiling.py delete mode 100644 examples/offline_inference/tpu.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 3151ea0e2ec2..c8bf00efb237 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -24,14 +24,16 @@ nav: - deployment/integrations - Training: training - Configuration: - - configuration/README.md - configuration/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/ - Models: - models/supported_models.md - models/generative_models.md - models/pooling_models.md - models/extensions - - Hardware Supported Models: models/hardware_supported_models + - Hardware Supported Models: + - 
models/hardware_supported_models/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/ - Features: features - Developer Guide: - contributing/README.md diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md deleted file mode 100644 index 2d24c9c6e2e9..000000000000 --- a/docs/configuration/tpu.md +++ /dev/null @@ -1,111 +0,0 @@ -# TPU Optimization Tips - -This doc serves as a collection of handy tips for optimizing your vLLM on TPU workload. - -## Get started - -Looking for setup and installation instructions? Find them [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/). - -### TPU workload sizing - -When selecting the ideal number of chips for a single serving instance, it's important to account for both the model size and the average request context length. Adequate HBM for the KV cache is essential to ensure a sufficient number of concurrent requests can be processed. - -The following colab [calculator](https://colab.research.google.com/github/ericehanley/rightsize-vllm/blob/main/HBM_Calculator.ipynb) will tell you: - -- KV cache size requirement per token and per request -- TPU/GPU memory consumed by the model weights -- TPU/GPU memory allocated for the KV cache -- Maximum \# of requests you can approximately set (--max-num-seqs) - -This approach serves as a general rule of thumb. - -#### Latency-throughput tradeoff - -As with rightsizing the number of chips for your workload, consider adjusting `--max-num-seqs` to fine-tune the latency-throughput balance. Decreasing `--max-num-seqs` and/or increasing the number of chips can help reduce latency. - -`--max-num-seqs` defines the number of concurrent decode slots, effectively limiting the number of requests the server can process tokens for simultaneously. Increasing this value allows the server to pre-allocate more HBM to handle a higher number of concurrent requests, which can maximize overall throughput. However, this often increases the end-to-end (e2e) latency per request. - -Therefore, carefully tuning `--max-num-seqs` is crucial to achieving the desired balance between latency and throughput for your specific workload. - -In a similar way, `--max-num-batch-tokens` can be adjusted down to improve latency, or adjusted up to improve throughput. - -#### Compilation and Caching - -Coming from a GPU background, one of the key differences you'll notice with TPUs is an initial compilation step. TPUs are specialized accelerators (ASICs) that achieve maximum performance by executing pre-compiled, static computation graphs via the XLA compiler. Unlike GPUs, which can handle dynamic input shapes more flexibly, TPUs require a specific compiled graph for each tensor shape (e.g., batch size and sequence length) they process. - -To manage this, vLLM performs a one-time "warmup" process when you first launch the server. During this phase, it pre-compiles the model for various common input shapes and saves these compiled graphs to a cache on disk or remote storage (located at `~/.cache/vllm/xla_cache` by default). This process can range significantly, anywhere from a few minutes to an hour depending on the size of the model and context length used. - -Although the first compilation can take some time, for all subsequent server launches, vLLM can load these graphs directly from the cache, eliminating the compilation time for future runs. - -Use `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (like when using autoscaling). 
- -#### Reducing compilation time - -This initial compilation time ranges significantly and is impacted by many of the arguments discussed in this optimization doc. Factors that influence the length of time to compile are things like model size and `--max-num-batch-tokens`. Other arguments you can tune are things like `VLLM_TPU_MOST_MODEL_LEN`. - -### Optimize based on your data - -#### max-model-len vs. most-model-len - -![most_model_len](../assets/design/tpu/most_model_len.png) - -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. - -For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. - -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. - -#### Padding - -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) - -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: - -1. the default exponential padding (pad to the nearest power of 2) -2. bucket padding (pad to the nearest linearly increasing bucket). - -When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. - -For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. - -The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. - -However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. - -#### Quantization - -If possible, use the precision that matches the chip’s hardware acceleration: - -- v5e has int4/int8 hardware acceleration in the MXU -- v6e has int4/int8 hardware acceleration in the MXU - -Supported quantized formats and features in vLLM on TPU [Jul '25]: - -- INT8 W8A8 -- INT8 W8A16 -- FP8 KV cache -- [WIP] FP8 W8A8 -- [WIP] AWQ -- [WIP] FP4 W4A8 - -#### Parallelization - -Don't set TP to be less than the number of chips on a single-host deployment. - -Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). 
- -### Tune your workloads - -Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. - -### Future Topics We'll Cover - -#### Profiling - -The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. - -#### SPMD - -More details to come. - -**Want us to cover something that isn't listed here? Open up an issue please and cite this doc. We'd love to hear your questions or tips.** diff --git a/docs/features/README.md b/docs/features/README.md index ad9de9ff8f36..5faf3768f321 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -59,20 +59,23 @@ th:not(:first-child) { ### Feature x Hardware -| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU | -|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------| -| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) | -| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | -| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | -| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ | -| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ | +| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU | +|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------| +| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | +| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 
enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | +| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ✅ | +| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +!!! note + For information on feature support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 74f005c496ee..7b5287bad3bb 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -43,24 +43,27 @@ th:not(:first-child) { } -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. - ❌ indicates that the quantization method is not supported on the specified hardware. +!!! note + For information on quantization support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. + !!! note This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
diff --git a/docs/models/hardware_supported_models/tpu.md b/docs/models/hardware_supported_models/tpu.md deleted file mode 100644 index 7b0a5ba6e72d..000000000000 --- a/docs/models/hardware_supported_models/tpu.md +++ /dev/null @@ -1,34 +0,0 @@ -# TPU - -## Supported Models - -### Text-only Language Models - -| Model | Architecture | Supported | -|-----------------------------------------------------|--------------------------------|-----------| -| mistralai/Mixtral-8x7B-Instruct-v0.1 | MixtralForCausalLM | 🟨 | -| mistralai/Mistral-Small-24B-Instruct-2501 | MistralForCausalLM | ✅ | -| mistralai/Codestral-22B-v0.1 | MistralForCausalLM | ✅ | -| mistralai/Mixtral-8x22B-Instruct-v0.1 | MixtralForCausalLM | ❌ | -| meta-llama/Llama-3.3-70B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-3.1-70B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-4-* | Llama4ForConditionalGeneration | ❌ | -| microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 | -| microsoft/phi-4 | Phi3ForCausalLM | ❌ | -| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 | -| google/gemma-3-4b-it | Gemma3ForConditionalGeneration | ❌ | -| deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM | ❌ | -| deepseek-ai/DeepSeek-V3 | DeepseekV3ForCausalLM | ❌ | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ | -| RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ | -| Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ | -| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | -| Qwen/Qwen2.5-7B-Instruct | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-32B | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-14B-Instruct | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-1.5B-Instruct | Qwen2ForCausalLM | 🟨 | - -✅ Runs and optimized. -🟨 Runs and correct but not optimized to green yet. -❌ Does not pass accuracy test or does not run. diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md deleted file mode 100644 index 8c9c1c92b676..000000000000 --- a/examples/offline_inference/profiling_tpu/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# vLLM TPU Profiling - -This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes. - -Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes. - -We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [Google TPU installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/google_tpu.html). - -> In all examples below, we run several warmups before (so `--enforce-eager` is okay) - -## Profile Examples - -### Generate Prefill Trace - -This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations. - -```bash -export XLA_HLO_DEBUG=1 -export MODEL=Qwen/Qwen2.5-7B-Instruct -export VLLM_TPU_PROFILE_DURATION_MS=3000 -export VLLM_TPU_PROFILE_DELAY_MS=0 - -python3 profiling.py \ - --model $MODEL \ - --input-len 1024 --output-len 1 \ - --batch-size 1 --enforce-eager \ - --max-model-len 2048 \ - --tensor-parallel-size 1 \ - --profile-result-dir profiles -``` - -### Generate Decode Trace - -This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. 
This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). - -```bash -export XLA_HLO_DEBUG=1 -export MODEL=meta-llama/Llama-3.1-70B-Instruct -export VLLM_TPU_PROFILE_DURATION_MS=2000 -export VLLM_TPU_PROFILE_DELAY_MS=1000 - -rm -rf ~/.cache/vllm/xla_cache -python3 profiling.py \ - --model $MODEL \ - --input-len 1 \ - --output-len 128 \ - --batch-size 32 \ - --enforce-eager \ - --profile-result-dir profiles \ - --max-model-len 2048 --tensor-parallel-size 8 -``` - -## Visualizing the profiles - -Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). - -Here are most likely the dependencies you need to install: - -```bash -pip install tensorflow-cpu \ - tensorboard-plugin-profile \ - etils \ - importlib_resources -``` - -Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: - -```bash -tensorboard --logdir profiles/ --port 6006 -``` diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py deleted file mode 100644 index 3b127e4fd29d..000000000000 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import dataclasses -import os -import time - -import numpy as np -import torch_xla.debug.profiler as xp -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptType -from vllm.utils.argparse_utils import FlexibleArgumentParser - -DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) -DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) - - -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**dataclasses.asdict(engine_args)) - server = xp.start_server(9012) # noqa: F841 - - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def run_to_completion(): - start_time = time.perf_counter() - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - # Warmup - print("Warming up...") - warmup_latencies = [] - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - warmup_latencies.append(run_to_completion()) - print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s") - - # Profile - profile_dir = args.profile_result_dir - print(f"Profiling (results will be saved to '{profile_dir}')...") - # Enable tracing on server - xp.trace_detached( - "localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS - ) - if DELAY_MS == 0: - time.sleep(1.0) - profile_latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profile iterations"): - profile_latencies.append(run_to_completion()) - print(f"Average profile latency: 
{np.mean(profile_latencies):.4f}s") - - return - - -def parse_args(): - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--num-iters-warmup", - type=int, - default=5, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", - type=int, - default=1, - help="Number of iterations to run for profiling.", - ) - parser.add_argument( - "--profile-result-dir", - type=str, - default="profiles", - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard " - "(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)." - ), - ) - - parser = EngineArgs.add_cli_args(parser) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py deleted file mode 100644 index 0093b63b0b1f..000000000000 --- a/examples/offline_inference/tpu.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import os - -from vllm import LLM, SamplingParams - -prompts = [ - "A robot may not injure a human being", - "It is only with the heart that one can see rightly;", - "The greatest glory in living lies not in never falling,", -] -answers = [ - " or, through inaction, allow a human being to come to harm.", - " what is essential is invisible to the eye.", - " but in rising every time we fall.", -] -N = 1 -# Currently, top-p sampling is disabled. `top_p` should be 1.0. -sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) - - -def main(): - parser = argparse.ArgumentParser(description="TPU offline inference example") - parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode") - args = parser.parse_args() - - llm_args = { - "model": "Qwen/Qwen2-1.5B-Instruct", - "max_num_batched_tokens": 64, - "max_num_seqs": 4, - "max_model_len": 128, - } - if args.use_spmd: - os.environ["VLLM_XLA_USE_SPMD"] = "1" - # Can only hardcode the number of chips for now. - # calling xr.global_runtime_device_count() beforeing init SPMD env in - # torch_xla will mess up the distributed env. - llm_args["tensor_parallel_size"] = 8 - # Use Llama, for num_kv_heads = 8. - llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" - - # Set `enforce_eager=True` to avoid ahead-of-time compilation. - # In real workloads, `enforce_eager` should be `False`. 
- llm = LLM(**llm_args) - outputs = llm.generate(prompts, sampling_params) - print("-" * 50) - for output, answer in zip(outputs, answers): - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - assert generated_text.startswith(answer) - print("-" * 50) - - -if __name__ == "__main__": - main() From 986ab5db6325fb4a5d937084ca7921a95641504a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Nov 2025 19:42:33 -0500 Subject: [PATCH 258/578] [CI Bugfix] Fix Kernels DeepGEMM Test (H100) (#29106) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 71249a9543c7..6169b279dc8a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -554,7 +554,6 @@ steps: timeout_in_minutes: 45 gpu: h100 num_gpus: 1 - optional: true source_file_dependencies: - tools/install_deepgemm.sh - vllm/utils/deep_gemm.py @@ -565,10 +564,10 @@ steps: - tests/kernels/moe/test_batched_deepgemm.py - tests/kernels/attention/test_deepgemm_attention.py commands: - - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s tests/kernels/moe/test_deepgemm.py - - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py - - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py - label: Model Executor Test # 23min timeout_in_minutes: 35 From 87cbbdff639f96766d4f6604cc970394c550dc5b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Nov 2025 20:16:52 -0500 Subject: [PATCH 259/578] Update model references for OLMo3 (#29099) Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f0531ced0aaa..626904a97415 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -424,7 +424,7 @@ th { | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | -| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. 
| | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 1999e3cd2de2..b088e16756d7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -370,7 +370,7 @@ def check_available_online( ), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"), - "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"), + "Olmo3ForCausalLM": _HfExamplesInfo("allenai/Olmo-3-7B-Instruct"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OpenPanguMTPModel": _HfExamplesInfo( "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1", From df44df01431e8af444222addddd2789c0483d70a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:41:49 -0500 Subject: [PATCH 260/578] [Feature] Shared Experts Overlap with FI deepgemm swap kernel, 2.2% throughput improvement and 3.6% TTFT improvement (#28879) Signed-off-by: yewentao256 --- .../fused_moe/fused_moe_modular_method.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 69 +++++++++------- .../layers/fused_moe/modular_kernel.py | 79 +++++++++++++++++-- .../layers/fused_moe/prepare_finalize.py | 3 +- 4 files changed, 119 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 43974ba917e4..c6dc95acdb63 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -50,6 +50,7 @@ def make( prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), shared_experts, + getattr(moe_layer, "shared_experts_stream", None), ), ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d9525a7439c3..b2f554efd8a6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -850,6 +850,45 @@ def update_expert_map(self): dp_size=get_dp_group().world_size, ) + def _maybe_setup_shared_experts_stream( + self, + hidden_states: torch.Tensor, + has_separate_shared_experts: bool, + use_chunked_impl: bool, + ) -> tuple[bool, torch.Tensor | None]: + use_shared_experts_stream = ( + has_separate_shared_experts + and not use_chunked_impl + and self.shared_experts_stream is not None + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream: + assert self.shared_experts_stream is not None + + # Clone BEFORE switching streams to avoid race condition + # where routed_expert kernel may mutate hidden_states. + hidden_states_clone = hidden_states.clone() + + # Record that the clone will be used by shared_experts_stream + # to avoid gc issue from deallocation of hidden_states_clone + # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # NOTE: We dont need shared_output.record_stream(current_stream()) + # because we synch the streams before using shared_output. 
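+            # record_stream marks the clone as in use by shared_experts_stream,
+            # so the caching allocator will not reuse its memory until the work
+            # queued on that stream has finished.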
+ hidden_states_clone.record_stream(self.shared_experts_stream) + + # Mark sync start point for the separate shared experts + # stream here since we want to run in parallel with the + # router/gate (next op below) + assert self.shared_experts_stream is not None + self.shared_experts_stream.wait_stream(current_stream()) + + return use_shared_experts_stream, hidden_states_clone + def _load_per_tensor_weight_scale( self, shard_id: str, @@ -1819,36 +1858,12 @@ def forward_impl( use_chunked_impl = self.use_dp_chunking - use_shared_experts_stream = ( - has_separate_shared_experts - and not use_chunked_impl - and self.shared_experts_stream is not None - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream( + hidden_states, has_separate_shared_experts, use_chunked_impl ) ) - if use_shared_experts_stream: - assert self.shared_experts_stream is not None - - # Clone BEFORE switching streams to avoid race condition - # where routed_expert kernel may mutate hidden_states. - hidden_states_clone = hidden_states.clone() - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We dont need shared_output.record_stream(current_stream()) - # because we synch the streams before using shared_output. - hidden_states_clone.record_stream(self.shared_experts_stream) - - # Mark sync start point for the separate shared experts - # stream here since we want to run in parallel with the - # router/gate (next op below) - assert self.shared_experts_stream is not None - self.shared_experts_stream.wait_stream(current_stream()) - # If router/gate provided, then apply it here. # (Note: This code runs only when "overlapped mode" is on to allow # parallel execution of shared experts with the FusedMoE via diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 093affe51f50..4af7af9257df 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -16,6 +16,7 @@ count_expert_num_tokens, disable_inplace, ) +from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, @@ -709,11 +710,13 @@ def __init__( prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEPermuteExpertsUnpermute, shared_experts: torch.nn.Module | None = None, + shared_experts_stream: torch.cuda.Stream | None = None, ): super().__init__() self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts self.shared_experts = shared_experts + self.shared_experts_stream = shared_experts_stream self._post_init_setup() assert ( @@ -890,6 +893,34 @@ def _slice_expert_tokens_metadata( expert_num_tokens_cpu=c_expert_num_tokens_cpu, ) + def _maybe_setup_shared_experts_stream( + self, hidden_states: torch.Tensor + ) -> tuple[bool, torch.Tensor | None]: + # decide whether to run shared experts on a separate CUDA stream to + # overlap with the main fused MoE kernel. 
+ use_shared_experts_stream = ( + self.shared_experts is not None + and self.shared_experts_stream is not None + and hidden_states.is_cuda + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream and self.shared_experts_stream is not None: + # TODO: Optimize this (complicated) + # Note: this clone adds overhead but is required + # for correctness with multiple CUDA streams and CUDA graph capture. + hidden_states_clone = hidden_states.clone() + # record that the clone will be used by the separate stream so its + # lifetime is correctly tracked. + hidden_states_clone.record_stream(self.shared_experts_stream) + self.shared_experts_stream.wait_stream(torch.cuda.current_stream()) + + return use_shared_experts_stream, hidden_states_clone + def _prepare( self, hidden_states: torch.Tensor, @@ -1077,12 +1108,30 @@ def _finalize( topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, + hidden_states_clone: torch.Tensor | None = None, + use_shared_experts_stream: bool = False, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ The _finalize method is a wrapper around self.prepare_finalize.finalize that handles DBO, async and shared expert overlap. """ - shared_output: torch.Tensor | None = None + + def maybe_run_shared_experts() -> torch.Tensor | None: + if self.shared_experts is None: + return None + + if ( + not use_shared_experts_stream + or self.shared_experts_stream is not None + and (not hidden_states.is_cuda or not torch.cuda.is_available()) + ): + # fall back to running on the current stream + return self.shared_experts(hidden_states) + + assert hidden_states_clone is not None + # launch shared experts on the dedicated stream. 
+ with torch.cuda.stream(self.shared_experts_stream): + return self.shared_experts(hidden_states_clone) if not self.prepare_finalize.supports_async(): assert not dbo_enabled() @@ -1095,8 +1144,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = maybe_run_shared_experts() else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1107,8 +1155,7 @@ def _finalize( self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = maybe_run_shared_experts() # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1131,12 +1178,28 @@ def _finalize( receiver() + self._wait_for_shared_experts_stream(hidden_states, use_shared_experts_stream) + if self.shared_experts is None: return output else: assert shared_output is not None return shared_output, output + def _wait_for_shared_experts_stream( + self, hidden_states: torch.Tensor, use_shared_experts_stream: bool + ) -> None: + # ensure that any work enqueued on the shared_experts_stream is + # completed before the shared_output tensor is consumed + if ( + self.shared_experts is not None + and use_shared_experts_stream + and self.shared_experts_stream is not None + and hidden_states.is_cuda + and current_platform.is_cuda() + ): + torch.cuda.current_stream().wait_stream(self.shared_experts_stream) + def forward( self, hidden_states: torch.Tensor, @@ -1183,6 +1246,10 @@ def forward( else: output = torch.zeros_like(hidden_states) + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream(hidden_states) + ) + local_num_experts = w1.size(0) if global_num_experts == -1: global_num_experts = local_num_experts @@ -1219,4 +1286,6 @@ def forward( topk_weights, topk_ids, apply_router_weight_on_input, + hidden_states_clone=hidden_states_clone, + use_shared_experts_stream=use_shared_experts_stream, ) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 9bb976fb9ec9..e27e2eb32da0 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -45,7 +45,8 @@ def prepare( assert topk == 1, ( "apply_router_weight_on_input is only implemented for topk=1" ) - a1.mul_(topk_weights.to(a1.dtype)) + # Note: do not use inplace for shared experts overlap + a1 = a1 * topk_weights.to(a1.dtype) a1q, a1q_scale = moe_kernel_quantize_input( a1, From 9875be6431872b513a8554c518e48ad79eba4656 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 21 Nov 2025 09:46:43 +0800 Subject: [PATCH 261/578] [LoRA][2/2]Remove LoRA extra vocab (#28545) Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 10 + tests/lora/test_layers.py | 189 ++----------------- tests/lora/test_llama_tp.py | 84 +++++---- tests/lora/test_lora_functions.py | 4 +- tests/lora/test_lora_manager.py | 20 +- tests/lora/test_worker.py | 8 +- tests/lora/utils.py | 8 - vllm/config/lora.py | 18 +- vllm/engine/arg_utils.py | 5 - vllm/lora/layers/base.py | 1 - vllm/lora/layers/base_linear.py | 1 - vllm/lora/layers/column_parallel_linear.py | 1 - vllm/lora/layers/fused_moe.py | 2 - vllm/lora/layers/logits_processor.py | 55 +----- vllm/lora/layers/vocal_parallel_embedding.py | 33 +--- 
vllm/lora/lora_weights.py | 24 --- vllm/lora/models.py | 54 +----- vllm/lora/punica_wrapper/punica_base.py | 11 +- vllm/lora/punica_wrapper/punica_gpu.py | 5 +- vllm/lora/punica_wrapper/punica_tpu.py | 3 +- vllm/lora/punica_wrapper/punica_xpu.py | 5 +- vllm/lora/utils.py | 10 + vllm/lora/worker_manager.py | 9 +- vllm/model_executor/models/granite.py | 34 +--- vllm/model_executor/models/llama.py | 30 +-- vllm/model_executor/models/mixtral.py | 32 +--- vllm/model_executor/models/teleflm.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 3 - 28 files changed, 133 insertions(+), 528 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index d8ff9339bb49..9d38ec542279 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -250,6 +250,16 @@ def olmoe_lora_files(): return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider") +@pytest.fixture(scope="session") +def qwen3_lora_files(): + return snapshot_download(repo_id="charent/self_cognition_Alice") + + +@pytest.fixture(scope="session") +def llama32_lora_files(): + return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider") + + @pytest.fixture def reset_default_device(): """ diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 8f18f0144193..9df3a07a9e5e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -136,7 +136,6 @@ def populate_loras( id_to_index: list[int | None], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, - generate_embeddings_tensor: int = 0, repeats: int = 1, ) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]: """This method populates the lora layers with lora weights. @@ -148,8 +147,6 @@ def populate_loras( layer: the LoRAlayer to populate. layer_weights: the PyTorch tensor containing the layer's weights. - generate_embeddings_tensor: whether to generate an - embeddings tensor for each LoRA. repeats: must only be set for column parallel packed layers. Indicates the number of loras to compose together to create a single lora layer. 
@@ -171,7 +168,6 @@ def populate_loras( sublora = DummyLoRAManager(layer_weights.device).init_random_lora( module_name=f"fake_{i}", weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, ) sublora.lora_b = sublora.lora_b[ (sublora_len * i) : (sublora_len * (i + 1)), : @@ -185,7 +181,6 @@ def populate_loras( slot_idx, lora_a=lora.lora_a, lora_b=lora.lora_b, - embeddings_tensor=lora.embeddings_tensor, ) lora_dict[lora_id] = lora @@ -306,7 +301,6 @@ def create_random_embedding_layer(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_embedding(torch.cat(inputs)) @@ -344,7 +338,6 @@ def create_random_embedding_layer(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_embedding(torch.cat(inputs)) @@ -354,149 +347,6 @@ def create_random_embedding_layer(): torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) -@torch.inference_mode() -# @pytest.mark.skip( -# reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4]) -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -@pytest.mark.parametrize("stage", STAGES) -def test_embeddings_with_new_embeddings( - dist_init, num_loras, device, vocab_size, stage -) -> None: - if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) - - torch.set_default_device(device) - max_loras = 8 - punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) - assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig( - max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 - ) - - def create_random_embedding_layer(): - embedding = VocabParallelEmbedding(vocab_size, 256) - embedding_data = torch.rand_like(embedding.weight.data) - embedding.weight.data = embedding_data - embedding.weight.data[vocab_size:, :] = 0 - expanded_embedding = VocabParallelEmbedding( - vocab_size + lora_config.lora_extra_vocab_size * max_loras, - 256, - org_num_embeddings=vocab_size, - ) - expanded_embedding.weight.data[:vocab_size, :] = embedding_data - # We need to deepcopy the embedding as it will be modified - # in place - lora_embedding = VocabParallelEmbeddingWithLoRA(deepcopy(expanded_embedding)) - lora_embedding.create_lora_weights(max_loras, lora_config) - - return expanded_embedding, lora_embedding - - for i in range(NUM_RANDOM_SEEDS): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - expanded_embedding, lora_embedding = create_random_embedding_layer() - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_embedding, - layer_weights=torch.zeros( - (256, vocab_size + lora_config.lora_extra_vocab_size) - ), - generate_embeddings_tensor=256, - ) - - lora_embedding.set_mapping(punica_wrapper) - # All embeddings tensors have the same shape. - embeddings_tensors = [ - lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) - ] - embeddings_tensor_len = embeddings_tensors[0].shape[0] - - # Add empty embeddings_tensors for unoccupied lora slots. 
- for _ in range(max_loras - len(embeddings_tensors)): - embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape)) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=num_loras * 3, - input_size=(200,), - input_range=(1, vocab_size), - device=device, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - vocab_size, - lora_config.lora_extra_vocab_size, - ) - original_inputs = deepcopy(inputs) - - # Force some of the inputs to be in the extended embeddings range - # to guarantee that their behavior is tested. - for input_, original_input_, lora_id in zip( - inputs, original_inputs, prompt_mapping - ): - embedding_id = lora_id - 1 - input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len) - original_input_[-1] = vocab_size - input_[-2] = vocab_size + ((embedding_id + 1) * embeddings_tensor_len - 1) - original_input_[-2] = vocab_size + embeddings_tensor_len - 1 - - expanded_embedding.weight[ - vocab_size : vocab_size + (embeddings_tensor_len * max_loras) - ] = torch.cat(embeddings_tensors) - - lora_result = lora_embedding(torch.cat(original_inputs)) - - expected_results: list[torch.Tensor] = [] - for input_, original_input_, lora_id in zip( - inputs, original_inputs, prompt_mapping - ): - lora = lora_dict[lora_id] - result = expanded_embedding(input_) - after_a = F.embedding( - original_input_, - lora.lora_a.T, - ) - result += after_a @ lora.lora_b.T - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_embedding.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=num_loras * 3, - input_size=(200,), - input_range=(1, vocab_size), - device=device, - ) - original_inputs = deepcopy(inputs) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - vocab_size, - lora_config.lora_extra_vocab_size, - ) - lora_result = lora_embedding(torch.cat(original_inputs)) - expected_result = expanded_embedding(torch.cat(inputs)) - - rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) - - @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @@ -518,16 +368,13 @@ def test_lm_head_logits_processor( def _pretest(): linear = ParallelLMHead( - vocab_size + lora_config.lora_extra_vocab_size, - 1024, - vocab_size, + num_embeddings=vocab_size, + embedding_dim=1024, params_dtype=torch.float16, ) linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data[:, vocab_size:] = 0 - logits_processor = LogitsProcessor( - vocab_size + lora_config.lora_extra_vocab_size, vocab_size - ) + logits_processor = LogitsProcessor(vocab_size) lora_logits_processor = LogitsProcessorWithLoRA( logits_processor, 1024, linear.weight.dtype, linear.weight.device, None ) @@ -541,15 +388,12 @@ def _pretest(): id_to_index = get_random_id_to_index(num_loras, max_loras) linear, logits_processor, lora_logits_processor = _pretest() lora_logits_processor.set_mapping(punica_wrapper) - # NOTE: all the 
generated loras share the same embeddings tensor. + lora_dict, _ = populate_loras( id_to_index, layer=lora_logits_processor, layer_weights=linear.weight, - generate_embeddings_tensor=1024, ) - embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor - embeddings_tensor_len = embeddings_tensor.shape[0] inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=list(lora_dict.keys()), @@ -565,7 +409,6 @@ def _pretest(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) input_ = torch.rand(20, 1024) @@ -575,23 +418,16 @@ def _pretest(): original_lm_head = deepcopy(linear) - linear.weight[ - logits_processor.org_vocab_size : logits_processor.org_vocab_size - + embeddings_tensor_len - ] = embeddings_tensor - - logits_processor.org_vocab_size = vocab_size + lora_config.lora_extra_vocab_size expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits( hidden_states=input_, lm_head=linear, embedding_bias=None ) - result[:, vocab_size + embeddings_tensor_len :] = float("-inf") + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) - logits_processor.org_vocab_size = vocab_size # Check that resetting the lora weights succeeds @@ -612,7 +448,6 @@ def _pretest(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_logits_processor._get_logits( @@ -694,7 +529,6 @@ def create_random_linear_replicated_layer(): id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -726,7 +560,10 @@ def create_random_linear_replicated_layer(): lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( - lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + lora_mapping, + id_to_index, + max_loras, + 512, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -817,7 +654,6 @@ def create_random_linear_parallel_layer(): id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -849,7 +685,10 @@ def create_random_linear_parallel_layer(): lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( - lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + lora_mapping, + id_to_index, + max_loras, + 512, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -963,7 +802,6 @@ class FakeConfig: id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -1000,7 +838,6 @@ class FakeConfig: id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 7bbd1e364d19..18704fa6e45d 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -13,17 +13,27 @@ from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test -MODEL_PATH = "meta-llama/Llama-2-7b-hf" +PROMPT_TEMPLATE = """<|eot_id|><|start_header_id|>user<|end_header_id|> +I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. 
+" +##Instruction: +candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key. +Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key. +The People_ID of candidate is the foreign key of People_ID of people. +###Input: +{context} +###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|> +""" # noqa: E501 EXPECTED_LORA_OUTPUT = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501 + "SELECT count(*) FROM candidate", + "SELECT count(*) FROM candidate", + "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 + "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 ] +MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct" + def do_sample( llm: vllm.LLM, @@ -32,18 +42,19 @@ def do_sample( tensorizer_config_dict: dict | None = None, ) -> list[str]: prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501 + PROMPT_TEMPLATE.format(context="How many candidates are there?"), + PROMPT_TEMPLATE.format(context="Count the number of candidates."), + PROMPT_TEMPLATE.format( + context="Which poll resource provided the most number of candidate information?" # noqa: E501 + ), + PROMPT_TEMPLATE.format( + context="Return the poll resource associated with the most candidates." + ), ] sampling_params = vllm.SamplingParams( - temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"] + temperature=0, max_tokens=64, stop=["<|im_end|>"] ) - if tensorizer_config_dict is not None: outputs = llm.generate( prompts, @@ -75,13 +86,15 @@ def do_sample( return generated_texts -def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None): +def generate_and_test( + llm, llama32_lora_files, tensorizer_config_dict: dict | None = None +): print("lora adapter created") print("lora 1") assert ( do_sample( llm, - sql_lora_files, + llama32_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=1, ) @@ -92,7 +105,7 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = assert ( do_sample( llm, - sql_lora_files, + llama32_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=2, ) @@ -104,51 +117,52 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = @create_new_process_for_each_test() @pytest.mark.parametrize("cudagraph_specialize_lora", [True, False]) -def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool): +def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, # also test odd max_num_seqs - max_num_seqs=13, + max_num_seqs=7, + max_model_len=1024, max_loras=4, compilation_config=vllm.config.CompilationConfig( cudagraph_specialize_lora=cudagraph_specialize_lora, ), ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=4) -def test_llama_lora_tp4(sql_lora_files): +def test_llama_lora_tp4(llama32_lora_files): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, - max_num_seqs=16, + max_num_seqs=7, + max_model_len=1024, max_loras=4, tensor_parallel_size=4, ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=4) -def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): +def test_llama_lora_tp4_fully_sharded_loras(llama32_lora_files): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, - max_num_seqs=16, + max_num_seqs=8, max_loras=4, + max_model_len=1024, tensor_parallel_size=4, fully_sharded_loras=True, ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=2) def test_tp2_serialize_and_deserialize_lora( - tmp_path, sql_lora_files, sql_lora_huggingface_id + tmp_path, + llama32_lora_files, ): # Run the tensorizing of the LoRA adapter and the model in a subprocess # to guarantee cleanup @@ -157,7 +171,7 @@ def test_tp2_serialize_and_deserialize_lora( model_name = "model-rank-%03d.tensors" model_ref = MODEL_PATH - lora_path = sql_lora_huggingface_id + lora_path = llama32_lora_files 
suffix = "test" try: result = subprocess.run( @@ -195,12 +209,12 @@ def test_tp2_serialize_and_deserialize_lora( loaded_llm = LLM( model=model_ref, - tokenizer=sql_lora_files, load_format="tensorizer", enable_lora=True, enforce_eager=True, model_loader_extra_config=tensorizer_config, - max_num_seqs=13, + max_num_seqs=7, + max_model_len=1024, tensor_parallel_size=2, max_loras=2, ) @@ -211,7 +225,7 @@ def test_tp2_serialize_and_deserialize_lora( print("lora 1") assert ( do_sample( - loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1 + loaded_llm, llama32_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1 ) == EXPECTED_LORA_OUTPUT ) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index e914393fee8a..1c692630284d 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -13,8 +13,8 @@ from vllm.lora.request import LoRARequest from vllm.v1.engine.llm_engine import LLMEngine -MODEL_PATH = "meta-llama/Llama-2-7b-hf" -LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" +MODEL_PATH = "Qwen/Qwen3-0.6B" +LORA_MODULE_PATH = "charent/self_cognition_Alice" LORA_RANK = 8 diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index e7816031142e..24d4dfca46d6 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -48,9 +48,6 @@ @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors")) - new_embeddings = load_file( - os.path.join(sql_lora_files, "new_embeddings.safetensors") - ) peft_helper = PEFTHelper.from_local_dir( sql_lora_files, max_position_embeddings=4096 @@ -60,7 +57,6 @@ def test_from_lora_tensors(sql_lora_files, device): tensors, peft_helper=peft_helper, device=device, - embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES, ) @@ -76,18 +72,6 @@ def test_from_lora_tensors(sql_lora_files, device): f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" ) assert lora.lora_a.shape[0] == 8 - embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), None - ) - if embeddings_module: - assert torch.equal( - lora.embeddings_tensor, - new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( - device=lora.embeddings_tensor.device - ), - ) - else: - assert lora.embeddings_tensor is None def create_lora( @@ -552,9 +536,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path worker_adapter_manager = WorkerLoRAManager( vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES ) - worker_adapter_manager.vocab_size = ( - dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size - ) + worker_adapter_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size worker_adapter_manager.create_lora_manager(dummy_model_gate_up) dummy_lora_files = f"{tmp_path}/lora_adapter" diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index c97f8debd1b9..b163559a9414 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -20,11 +20,12 @@ from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker +MODEL_PATH = "Qwen/Qwen3-0.6B" NUM_LORAS = 16 @patch.dict(os.environ, {"RANK": "0"}) -def test_worker_apply_lora(sql_lora_files): +def test_worker_apply_lora(qwen3_lora_files): def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): lora_mapping = LoRAMapping([], []) @@ 
-34,9 +35,10 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): vllm_config = VllmConfig( model_config=ModelConfig( - "meta-llama/Llama-2-7b-hf", + MODEL_PATH, seed=0, dtype="float16", + max_model_len=127, enforce_eager=True, ), load_config=LoadConfig( @@ -73,7 +75,7 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): assert worker.list_loras() == set() lora_requests = [ - LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS) + LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS) ] set_active_loras(worker, lora_requests) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index d30b77f09466..6aba5299b582 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -28,7 +28,6 @@ def init_random_lora( module_name: str, weight: torch.Tensor, rank: int = 8, - generate_embeddings_tensor: int = 0, ): lora = LoRALayerWeights( module_name, @@ -41,13 +40,6 @@ def init_random_lora( [weight.shape[0], rank], dtype=weight.dtype, device=self._device ), ) - if generate_embeddings_tensor: - lora.embeddings_tensor = torch.rand( - 5, - generate_embeddings_tensor, - dtype=weight.dtype, - device=self._device, - ) self.set_module_lora(module_name, lora) return lora diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 84e92eef4007..072e0ec2104f 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from typing import TYPE_CHECKING, Any, ClassVar, Literal +from typing import TYPE_CHECKING, Any, Literal import torch from pydantic import ConfigDict, Field, model_validator @@ -11,7 +11,6 @@ from vllm.config.utils import config from vllm.logger import init_logger -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.config import ModelConfig @@ -46,19 +45,6 @@ class LoRAConfig: `max_loras`.""" lora_dtype: torch.dtype | LoRADType = "auto" """Data type for LoRA. If auto, will default to base model dtype.""" - lora_extra_vocab_size: LoRAExtraVocabSize = Field( - default=256, - deprecated=( - "`lora_extra_vocab_size` is deprecated and will be removed " - "in v0.12.0. Additional vocabulary support for " - "LoRA adapters is being phased out." - ), - ) - """(Deprecated) Maximum size of extra vocabulary that can be present in a - LoRA adapter. 
Will be removed in v0.12.0.""" - lora_vocab_padding_size: ClassVar[int] = ( - current_platform.get_lora_vocab_padding_size() - ) default_mm_loras: dict[str, str] | None = None """Dictionary mapping specific modalities to LoRA model paths; this field is only applicable to multimodal models and should be leveraged when a @@ -87,8 +73,6 @@ def compute_hash(self) -> str: factors.append(self.max_loras) factors.append(self.fully_sharded_loras) factors.append(self.lora_dtype) - factors.append(self.lora_extra_vocab_size) - factors.append(self.lora_vocab_padding_size) hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 74828bc109cb..bcb90119f9b0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -484,7 +484,6 @@ class EngineArgs: fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype - lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override @@ -1011,9 +1010,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: ) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) - lora_group.add_argument( - "--lora-extra-vocab-size", **lora_kwargs["lora_extra_vocab_size"] - ) lora_group.add_argument( "--lora-dtype", **lora_kwargs["lora_dtype"], @@ -1680,7 +1676,6 @@ def create_engine_config( max_loras=self.max_loras, default_mm_loras=self.default_mm_loras, fully_sharded_loras=self.fully_sharded_loras, - lora_extra_vocab_size=self.lora_extra_vocab_size, lora_dtype=self.lora_dtype, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 0c7e80684889..62326c05b2bd 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -44,7 +44,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): """Overwrites lora tensors at index.""" ... 
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 3db4165e2017..e85c5bd70b07 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -96,7 +96,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): # Except for QKVParallelLinearWithLoRA and # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 637ded9b2a0f..273c4950e323 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -248,7 +248,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 3291c41fcda1..adf30855cafc 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -406,8 +406,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, - bias: torch.Tensor | None = None, ): """Overwrites lora tensors at index.""" self.reset_lora(index) diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index adc5e861f57f..06f92652031e 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math import torch import torch.nn as nn @@ -108,22 +107,13 @@ def create_lora_weights( ( max_loras, 1, - # Pad for kernel compatibility - math.ceil( - self.base_layer.vocab_size / lora_config.lora_vocab_padding_size - ) - * lora_config.lora_vocab_padding_size, + self.base_layer.vocab_size, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, device=self.device, ) - self.embeddings_tensors = torch.full( - (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), - fill_value=float("-inf"), - dtype=self.dtype, - device=self.device, - ) + if self.sharded_to_full_mapping is not None: self.sharded_to_full_mapping_gpu = torch.tensor( self.sharded_to_full_mapping, device=self.device, dtype=torch.long @@ -134,14 +124,12 @@ def create_lora_weights( def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = float("-inf") def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_( @@ -150,12 +138,6 @@ def set_lora( self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( lora_b, non_blocking=True ) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - : embeddings_tensor.shape[0], - : embeddings_tensor.shape[1], - ] = embeddings_tensor def _get_logits( self, @@ -193,39 +175,6 @@ def _get_logits( # token_id: [0, 1, 2, 3, 4, 5, -1, -1] logits = logits[:, self.sharded_to_full_mapping_gpu] - lora_logits = torch.empty( - self.embeddings_tensors.shape[0] + 1, - self.embeddings_tensors.shape[1], - hidden_states.shape[0], - dtype=self.embeddings_tensors.dtype, - device=self.embeddings_tensors.device, - ) - torch.matmul(self.embeddings_tensors, hidden_states.T, out=lora_logits[:-1]) - - neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype) 
- - lora_logits[-1] = neg_inf - lora_logits = lora_logits.mT - indices_padded = self.punica_wrapper.sampler_indices_padded - - if current_platform.is_tpu() or current_platform.is_xpu(): - indices_padded = indices_padded[: logits.size(0)] - - lora_logits = ( - lora_logits.reshape( - lora_logits.shape[0] * lora_logits.shape[1], - lora_logits.shape[2], - ) - .index_select(0, indices_padded) - .nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf) - ) - - logits[ - :, - self.base_layer.org_vocab_size : self.base_layer.org_vocab_size - + lora_logits.shape[1], - ] = lora_logits - lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits( logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0 ) diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index ca4ad8012e9c..5b1f7886bc23 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -46,19 +46,10 @@ def create_lora_weights( self.embeddings_slice = None self.embeddings_weights = None - self.embeddings_tensors = torch.zeros( - ( - max_loras, - lora_config.lora_extra_vocab_size, - self.base_layer.embedding_dim, - ), - dtype=self.base_layer.weight.dtype, - device=self.base_layer.weight.device, - ) self.lora_a_stacked = torch.zeros( ( max_loras, - self.base_layer.org_vocab_size + lora_config.lora_extra_vocab_size, + self.base_layer.org_vocab_size, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -82,14 +73,12 @@ def create_lora_weights( def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = 0 def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) # NOTE self.lora_a_stacked is row-major, and lora_a is col-major, @@ -100,36 +89,18 @@ def set_lora( self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( lora_b, non_blocking=True ) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - : embeddings_tensor.shape[0], - : embeddings_tensor.shape[1], - ].copy_(embeddings_tensor, non_blocking=True) - if self.embeddings_slice is not None: - # TODO(yard1): Optimize this copy, we don't need to copy - # everything, just the modified part - embeddings = self.embeddings_tensors.view( - self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2], - )[self.embeddings_slice[0] : self.embeddings_slice[1]] - assert self.embeddings_weights is not None - self.embeddings_weights[: embeddings.shape[0]].copy_(embeddings) def forward(self, x: torch.Tensor) -> torch.Tensor: - added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, 1, 0) - # NB: Don't use torch.narrow here. 
torch.narrow triggers some # Dynamic Shape specialization in torch.compile num_tokens = x.shape[0] indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens] - indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens] full_lora_a_embeddings = F.embedding( x + indices_1, self.lora_a_stacked_2d, ) - full_output = self.base_layer.forward(x + (indices_0 * added_tokens_mask)) + full_output = self.base_layer.forward(x) full_output_org = full_output if full_output.ndim == 3: diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index 7691481d5039..f0d8e2219405 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -21,7 +21,6 @@ def __init__( lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None = None, scaling: float | None = None, ) -> None: self.module_name = module_name @@ -29,7 +28,6 @@ def __init__( self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b - self.embeddings_tensor = embeddings_tensor if scaling is None: self.scaling = self.lora_alpha / self.rank @@ -56,18 +54,11 @@ def output_dim(self) -> int: def is_packed(self) -> bool: return False - @property - def extra_vocab_size(self) -> int: - return ( - self.embeddings_tensor.shape[0] if self.embeddings_tensor is not None else 0 - ) - @classmethod def from_config( cls, module_name: str, peft_helper: PEFTHelper, - embeddings_tensor: torch.Tensor | None = None, ) -> "LoRALayerWeights": # lora_a and lora_b are set to None for config-based construction return cls( @@ -76,7 +67,6 @@ def from_config( peft_helper.lora_alpha, None, None, - embeddings_tensor, peft_helper.vllm_lora_scaling_factor, ) @@ -89,7 +79,6 @@ def create_dummy_lora_weights( rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: int | None = None, ) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros( @@ -99,24 +88,12 @@ def create_dummy_lora_weights( [output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory ) - embeddings_tensor = ( - torch.rand( - 10, - embeddings_tensor_dim, - dtype=dtype, - device=device, - pin_memory=pin_memory, - ) - if embeddings_tensor_dim - else None - ) return cls( module_name, rank=rank, lora_alpha=1, lora_a=lora_a, lora_b=lora_b, - embeddings_tensor=embeddings_tensor, ) @@ -139,7 +116,6 @@ def __init__( lora_a=lora_a, lora_b=lora_b, scaling=scaling, # type: ignore - embeddings_tensor=None, ) self.lora_alphas = lora_alphas if scaling is None: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 02c252f15bfa..eb11cd0afc48 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -21,6 +21,7 @@ from_layer, from_layer_logits_processor, get_supported_lora_modules, + is_base_embeddding_weights, is_regex_target_modules, parse_fine_tuned_lora_name, process_packed_modules_mapping, @@ -93,14 +94,6 @@ def clone(self, lora_model_id: int) -> "LoRAModel": loras=self.loras.copy(), ) - @property - def extra_vocab_size(self) -> int: - return ( - max(lora.extra_vocab_size for lora in self.loras.values()) - if self.loras - else 0 - ) - def get_lora(self, module_name: str) -> LoRALayerWeights | None: """Get LoRA for a given module by name""" return self.loras.get(module_name, None) @@ -117,7 +110,6 @@ def from_lora_tensors( peft_helper: PEFTHelper, device: str = "cuda", dtype: torch.dtype | None = None, - embeddings: dict[str, torch.Tensor] | None = None, target_embedding_padding: int | None = None, embedding_modules: dict[str, str] | None = 
None, embedding_padding_modules: list[str] | None = None, @@ -127,24 +119,14 @@ def from_lora_tensors( pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): + if is_base_embeddding_weights(tensor_name): + continue module_name, is_lora_a = parse_fine_tuned_lora_name( tensor_name, weights_mapper ) if module_name not in loras: - lora_embeddings_tensor = None - if embeddings: - assert embedding_modules is not None - embeddings_module = next( - (k for k in embedding_modules if k in module_name), None - ) - if embeddings_module: - lora_embeddings_tensor = embeddings[ - embedding_modules[embeddings_module] - ].to(device=device, dtype=dtype) - if pin_memory: - lora_embeddings_tensor = lora_embeddings_tensor.pin_memory() loras[module_name] = LoRALayerWeights.from_config( - module_name, peft_helper, lora_embeddings_tensor + module_name, peft_helper ) if is_lora_a: @@ -206,15 +188,17 @@ def from_local_checkpoint( lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - new_embeddings_tensor_path = os.path.join( - lora_dir, "new_embeddings.safetensors" - ) - new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + # new_embeddings_tensor_path = os.path.join( + # lora_dir, "new_embeddings.safetensors" + # ) + # new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") tensors: dict[str, torch.Tensor] = {} unexpected_modules: list[list[str] | str] = [] def check_unexpected_modules(modules: dict): for lora_module in modules.keys(): # noqa + if is_base_embeddding_weights(lora_module): + continue module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) # Handle FSDP file format where experts.base_layer is the # gate_up_proj and experts is the down_proj @@ -300,21 +284,12 @@ def check_unexpected_modules(modules: dict): else: raise ValueError(f"{lora_dir} doesn't contain tensors") - embeddings = None - if os.path.isfile(new_embeddings_tensor_path): - embeddings = safetensors.torch.load_file(new_embeddings_tensor_path) - elif os.path.isfile(new_embeddings_bin_file_path): - embeddings = torch.load( - new_embeddings_bin_file_path, map_location=device, weights_only=True - ) - return cls.from_lora_tensors( lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, tensors=tensors, peft_helper=peft_helper, device=device, dtype=dtype, - embeddings=embeddings, target_embedding_padding=target_embedding_padding, embedding_modules=embedding_modules, embedding_padding_modules=embedding_padding_modules, @@ -474,7 +449,6 @@ def activate_adapter( index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor, ) else: module.reset_lora(index) @@ -505,7 +479,6 @@ def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: self.lora_index_to_id, self.lora_slots + 1, self.vocab_size, - self.lora_config.lora_extra_vocab_size, ) def remove_all_adapters(self): @@ -616,7 +589,6 @@ def create_dummy_lora( if parts[-1] in embedding_modules: input_dim = ( module.base_layer.org_vocab_size - + self.lora_config.lora_extra_vocab_size if hasattr(module.base_layer, "org_vocab_size") else module.base_layer.weight.shape[1] ) @@ -625,11 +597,6 @@ def create_dummy_lora( if hasattr(module.base_layer, "embedding_dim") else module.base_layer.weight.shape[0] ) - embeddings_tensor_dim = ( - module.base_layer.embedding_dim - if 
hasattr(module.base_layer, "embedding_dim") - else module.base_layer.weight.shape[1] - ) lora = LoRALayerWeights.create_dummy_lora_weights( module_name, input_dim, @@ -637,7 +604,6 @@ def create_dummy_lora( rank, module.lora_a_stacked[0].dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim, ) else: lora = LoRALayerWeights.create_dummy_lora_weights( diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index a6ffbb7b71ce..7c0fc8167711 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -31,7 +31,6 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ) -> None: """ @@ -172,8 +171,11 @@ def _update_base_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, ): + # NOTE We have removed lora extra vocab support for now. So we set + # extra_vocab_size always to 0, and extra_vocab_size will be removed. + + extra_vocab_size = 0 ( base_indices, sampler_indices, @@ -285,12 +287,9 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) if mapping.is_prefill: # Update metadata required for prefill-related operators. diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index d863a5884d3c..52138ef0cc3b 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -65,13 +65,10 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): self.is_prefill = mapping.is_prefill - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) # Prepare cuda kernel metadata tensors self.token_mapping_meta.prepare_tensors(self.token_lora_indices) diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 090878dcd254..0888772db54e 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -292,7 +292,6 @@ def _update_base_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, ): # Make sure we don't accidentally collect outside operations torch_xla.sync() @@ -313,7 +312,7 @@ def _update_base_metadata( lora_index_to_id, max_loras, vocab_size, - extra_vocab_size, + 0, # extra_vocab_size "cpu", ) self._token_lora_indices = self._pad_to_shape( diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py index b95087d0ff83..00c00782896c 100644 --- a/vllm/lora/punica_wrapper/punica_xpu.py +++ b/vllm/lora/punica_wrapper/punica_xpu.py @@ -43,13 +43,10 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): self.is_prefill = mapping.is_prefill - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor: return torch.narrow(self._token_lora_indices, 0, 0, x.size(0)) diff
--git a/vllm/lora/utils.py b/vllm/lora/utils.py index 0f43ff06d8f2..a49a7d9d1669 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -166,6 +166,16 @@ def parse_fine_tuned_lora_name( raise ValueError(f"{name} is unsupported LoRA weight") +def is_base_embeddding_weights(name: str) -> bool: + # hardcoded suffixes for input & output embedding weights + input_embedding_subfix = ".embed_tokens.base_layer.weight" + output_embedding_subfix = ".lm_head.base_layer.weight" + + return name.endswith(input_embedding_subfix) or name.endswith( + output_embedding_subfix + ) + + def is_regex_target_modules( load_modules: str | list[str], expected_lora_modules: list[str] ) -> bool: diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index b85151f2c759..4cc201a6414f 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -121,8 +121,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, - target_embedding_padding=self.vocab_size - + self.lora_config.lora_extra_vocab_size, + target_embedding_padding=self.vocab_size, embedding_modules=self.embedding_modules, embedding_padding_modules=self.embedding_padding_modules, tensorizer_config_dict=lora_request.tensorizer_config_dict, @@ -143,12 +142,6 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: # For BadRequestError raise e - if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} " - f"is greater than lora_extra_vocab_size " - f"{self.lora_config.lora_extra_vocab_size}." - ) return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 1dc205b47753..cd7ce2fc8f00 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -261,29 +260,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank ): self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, ) else: @@ -420,28 +406,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config + self.quant_config =
quant_config self.model = GraniteModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -453,7 +429,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale /= config.logits_scaling self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, scale=logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d5b49d2fb4c2..ebf8addda4a5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -368,24 +367,18 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + + self.vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank ): self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, quant_config=quant_config, ) else: @@ -562,9 +555,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config self.model = self._init_model( vllm_config=vllm_config, @@ -573,20 +564,9 @@ def __init__( ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=( - DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size - ), quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -595,7 +575,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 54ab8dd493e7..0a9c3f136964 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -51,7 
+51,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -301,23 +300,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config + parallel_config = vllm_config.parallel_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab + + self.vocab_size = config.vocab_size self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, ) self.enable_eplb = parallel_config.enable_eplb @@ -508,34 +502,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config + self.config = config - self.lora_config = lora_config + self.quant_config = quant_config self.model = MixtralModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size - ) + self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors ) diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py index 8a0bec9dff84..bebd7bcaa924 100644 --- a/vllm/model_executor/models/teleflm.py +++ b/vllm/model_executor/models/teleflm.py @@ -74,5 +74,5 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.output_mult = self.config.output_mult / self.mup_scale_factor logit_scale = self.output_mult self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, self.config.vocab_size, logit_scale + self.config.vocab_size, scale=logit_scale ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index e9eb7cad38f8..923c31c187f3 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -219,9 +219,6 @@ def __init__( self.hidden_size = model_config.get_hidden_size() self.vocab_size = model_config.get_vocab_size() - if self.lora_config is not None: - self.vocab_size += self.lora_config.lora_extra_vocab_size - # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope From ed6ae1e36a03bed4a29287163e051a7772b1d8b1 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 20 Nov 2025 17:54:35 -0800 Subject: [PATCH 
262/578] [AITER] [ROCm] Fix crash when loading llama4 model with old aiter version installed, fallback to forward_native implementation (#29124) Signed-off-by: Xiao Li --- vllm/v1/sample/ops/topk_topp_sampler.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index c6c7e924175f..5b2d130b0ea4 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -60,13 +60,20 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: logprobs_mode not in ("processed_logits", "processed_logprobs") and rocm_aiter_ops.is_enabled() ): - import aiter.ops.sampling # noqa: F401 + try: + import aiter.ops.sampling # noqa: F401 - self.aiter_ops = torch.ops.aiter - logger.info_once( - "Using aiter sampler on ROCm (lazy import, sampling-only)." - ) - self.forward = self.forward_hip + self.aiter_ops = torch.ops.aiter + logger.info_once( + "Using aiter sampler on ROCm (lazy import, sampling-only)." + ) + self.forward = self.forward_hip + except ImportError: + logger.warning_once( + "aiter.ops.sampling is not available on ROCm. " + "Falling back to forward_native implementation." + ) + self.forward = self.forward_native else: self.forward = self.forward_native From e1eefa4c40fc5b28bd7e83b6596bb5d2f420fd92 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:54:59 -0500 Subject: [PATCH 263/578] [Bug] Fix torch warning of tf32 usage (#29112) Signed-off-by: yewentao256 --- vllm/model_executor/layers/batch_invariant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 69fa6bdffd43..bec7af028634 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -852,5 +852,6 @@ def init_batch_invariance(): enable_batch_invariant_mode() # Disable TF32 for batch invariance - it causes non-deterministic rounding - torch.backends.cuda.matmul.allow_tf32 = False - torch.backends.cudnn.allow_tf32 = False + torch.backends.cuda.matmul.fp32_precision = "ieee" + torch.backends.cudnn.conv.fp32_precision = "ieee" + torch.backends.cudnn.rnn.fp32_precision = "ieee" From 3f5f36da3fefbae96960f60d41ccf8ac1155515e Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:30:07 -0500 Subject: [PATCH 264/578] [ROCm] Fix for import when building with upstream triton for gfx1100 for gpt-oss serving (#29127) Signed-off-by: Hongxia Yang --- .../layers/quantization/utils/mxfp4_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index cbc46810a26a..d0c8b3d1a309 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -39,15 +39,15 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): value_layout = StridedLayout scale_layout = StridedLayout elif current_platform.is_rocm(): - from triton_kernels.tensor_details.layout import ( - GFX950MXScaleLayout, - StridedLayout, - ) - from vllm.platforms.rocm import on_gfx950 value_layout = StridedLayout - scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout + if on_gfx950(): + from 
triton_kernels.tensor_details.layout import GFX950MXScaleLayout + + scale_layout = GFX950MXScaleLayout + else: + scale_layout = StridedLayout else: value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout( mx_axis=1 From 56669c1f293d5c53b6a19ddf2f78802fa9fff2c2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:36:07 -0500 Subject: [PATCH 265/578] [CI] Fix mypy for `vllm/v1/worker` (#29037) Signed-off-by: yewentao256 --- tools/pre_commit/mypy.py | 2 +- vllm/model_executor/utils.py | 2 +- vllm/multimodal/utils.py | 4 +- vllm/v1/worker/cpu_worker.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 128 +++++++++++------- vllm/v1/worker/gpu_ubatch_wrapper.py | 20 ++- vllm/v1/worker/gpu_worker.py | 62 +++++---- .../worker/kv_connector_model_runner_mixin.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 28 +++- vllm/v1/worker/tpu_worker.py | 5 +- vllm/v1/worker/utils.py | 8 +- vllm/v1/worker/worker_base.py | 2 + vllm/v1/worker/xpu_worker.py | 9 +- 13 files changed, 180 insertions(+), 104 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 8d04848f8f78..34f6e8c928ff 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -38,6 +38,7 @@ "vllm/usage", "vllm/v1/core", "vllm/v1/engine", + "vllm/v1/worker", ] # After fixing errors resulting from changing follow_imports @@ -62,7 +63,6 @@ "vllm/v1/sample", "vllm/v1/spec_decode", "vllm/v1/structured_output", - "vllm/v1/worker", ] # TODO(woosuk): Include the code from Megatron and HuggingFace. diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 759b809433b1..8aad59e84ff2 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -10,7 +10,7 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer -def set_random_seed(seed: int) -> None: +def set_random_seed(seed: int | None) -> None: from vllm.platforms import current_platform current_platform.seed_everything(seed) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3f55c46ca334..ac89bdacc01d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,7 +3,7 @@ import asyncio import atexit -from collections.abc import Iterable, Set +from collections.abc import Generator, Set from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path @@ -403,7 +403,7 @@ def group_mm_kwargs_by_modality( pin_memory: bool = False, merge_by_field_config: bool | None = None, multimodal_cpu_fields: Set[str] = frozenset(), -) -> Iterable[tuple[str, int, BatchedTensorInputs]]: +) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]: """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same modality together into the same `MultiModalKwargs` instance. diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 4420a057d1e5..b080fea1d2dd 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -3,6 +3,7 @@ import os import platform from collections.abc import Callable +from typing import Any import torch @@ -37,6 +38,9 @@ def __init__( self.parallel_config.disable_custom_all_reduce = True + # Torch profiler. 
Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" @@ -80,13 +84,13 @@ def init_device(self): self.local_omp_cpuid = "nobind" else: local_dp_rank = self.parallel_config.data_parallel_rank_local - omp_cpuids = omp_cpuids.split("|") + omp_cpuids_list = omp_cpuids.split("|") if local_dp_rank is not None: world_size = self.parallel_config.world_size - omp_cpuids = omp_cpuids[ + omp_cpuids_list = omp_cpuids_list[ local_dp_rank * world_size : (local_dp_rank + 1) * world_size ] - self.local_omp_cpuid = omp_cpuids[self.rank] + self.local_omp_cpuid = omp_cpuids_list[self.rank] if self.local_omp_cpuid != "nobind": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) @@ -120,7 +124,7 @@ def wake_up(self, tags: list[str] | None = None) -> None: pass def determine_available_memory(self) -> int: - return self.cache_config.cpu_kvcache_space_bytes # type: ignore + return self.cache_config.cpu_kvcache_space_bytes or 0 def compile_or_warm_up_model(self) -> None: # Reset the seed to ensure that the random state is not affected by diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4b0a08ab57e1..a7fa68b20ac5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import itertools import time from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Iterator, Sequence from contextlib import contextmanager from copy import copy, deepcopy from functools import reduce @@ -53,6 +53,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( + SupportsMRoPE, SupportsMultiModal, is_mixture_of_experts, supports_eagle3, @@ -126,6 +127,7 @@ ) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs +from vllm.v1.sample.logits_processor.interface import LogitsProcessor from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -404,7 +406,10 @@ def __init__( # solution, we initialize the input batch here, and re-initialize it # in `initialize_kv_cache` if the block_sizes here is different from # the block_sizes in the kv cache config. - custom_logitsprocs = model_config.logits_processors + logits_processors = model_config.logits_processors + custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = ( + tuple(logits_processors) if logits_processors is not None else () + ) self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, # We need to use the encoder length for encoder-decoer @@ -959,9 +964,13 @@ def _update_states_after_model_execute( def _init_mrope_positions(self, req_state: CachedRequestState): model = self.get_model() assert supports_mrope(model), "M-RoPE support is not implemented." + assert req_state.prompt_token_ids is not None, ( + "M-RoPE requires prompt_token_ids to be available." 
+ ) + mrope_model = cast(SupportsMRoPE, model) req_state.mrope_positions, req_state.mrope_position_delta = ( - model.get_mrope_input_positions( + mrope_model.get_mrope_input_positions( req_state.prompt_token_ids, req_state.mm_features, ) @@ -1762,6 +1771,7 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len + assert req.mrope_position_delta is not None MRotaryEmbedding.get_next_input_positions_tensor( out=self.mrope_positions.np, out_offset=dst_start, @@ -1907,6 +1917,8 @@ def _batch_mm_kwargs_from_scheduler( for mm_input_id in encoder_input_ids: mm_feature = req_state.mm_features[mm_input_id] + if mm_feature.data is None: + continue mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) @@ -1930,7 +1942,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): # multimodal inputs. The proper solution should be reordering the # encoder outputs. model = cast(SupportsMultiModal, self.model) - encoder_outputs = [] + encoder_outputs: list[torch.Tensor] = [] for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality( mm_kwargs, device=self.device, @@ -1938,7 +1950,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ): - curr_group_outputs = [] + curr_group_outputs: list[torch.Tensor] = [] # EVS-related change. # (ekhvedchenia): Temporary hack to limit peak memory usage when @@ -1980,7 +1992,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
- curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # type: ignore[assignment] sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2180,7 +2192,7 @@ def get_supported_tasks(self) -> tuple[SupportedTask, ...]: def sync_and_slice_intermediate_tensors( self, num_tokens: int, - intermediate_tensors: IntermediateTensors, + intermediate_tensors: IntermediateTensors | None, sync_self: bool, ) -> IntermediateTensors: assert self.intermediate_tensors is not None @@ -2397,6 +2409,7 @@ def _preprocess( if is_first_rank: intermediate_tensors = None else: + assert intermediate_tensors is not None intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True ) @@ -2765,14 +2778,14 @@ def execute_model( uniform_decode = ( max_num_scheduled_tokens == self.uniform_decode_query_len ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens) - batch_descriptor = BatchDescriptor( + batch_desc = BatchDescriptor( num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=len(self.input_batch.lora_id_to_lora_request) > 0, ) cudagraph_runtime_mode, batch_descriptor = ( self.cudagraph_dispatcher.dispatch( - batch_descriptor, + batch_desc, use_cascade_attn=cascade_attn_prefix_lens is not None, ) ) @@ -2856,15 +2869,15 @@ def execute_model( else: logits = self.model.compute_logits(sample_hidden_states) - model_output_broadcast_data = {} + model_output_broadcast_data: dict[str, Any] = {} if logits is not None: model_output_broadcast_data["logits"] = logits.contiguous() - model_output_broadcast_data = get_pp_group().broadcast_tensor_dict( + broadcasted = get_pp_group().broadcast_tensor_dict( model_output_broadcast_data, src=len(get_pp_group().ranks) - 1 ) - assert model_output_broadcast_data is not None - logits = model_output_broadcast_data["logits"] + assert broadcasted is not None + logits = broadcasted["logits"] self.execute_model_state = ExecuteModelState( scheduler_output, @@ -2889,7 +2902,7 @@ def sample_tokens( if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used. 
if not kv_connector_output: - return None # noqa + return None # type: ignore[return-value] # In case of PP with kv transfer, we need to pass through the # kv_connector_output @@ -2941,33 +2954,37 @@ def propose_draft_token_ids( spec_decode_common_attn_metadata, ) + spec_config = self.speculative_config use_padded_batch_for_eagle = ( - self.speculative_config - and self.speculative_config.use_eagle() - and not self.speculative_config.disable_padded_drafter_batch + spec_config is not None + and spec_config.use_eagle() + and not spec_config.disable_padded_drafter_batch ) effective_drafter_max_model_len = self.max_model_len if effective_drafter_max_model_len is None: effective_drafter_max_model_len = self.model_config.max_model_len if ( - self.speculative_config - and self.speculative_config.draft_model_config is not None - and self.speculative_config.draft_model_config.max_model_len is not None + spec_config is not None + and spec_config.draft_model_config is not None + and spec_config.draft_model_config.max_model_len is not None ): effective_drafter_max_model_len = ( - self.speculative_config.draft_model_config.max_model_len + spec_config.draft_model_config.max_model_len ) input_fits_in_drafter = spec_decode_common_attn_metadata and ( spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len ) if use_padded_batch_for_eagle: + assert self.speculative_config is not None + assert isinstance(self.drafter, EagleProposer) sampled_token_ids = sampler_output.sampled_token_ids if input_fits_in_drafter: # EAGLE speculative decoding can use the GPU sampled tokens # as inputs, and does not need to wait for bookkeeping to finish. propose_draft_token_ids(sampled_token_ids) elif self.valid_sampled_token_count_event is not None: + assert spec_decode_common_attn_metadata is not None next_token_ids, valid_sampled_tokens_count = ( self.drafter.prepare_next_token_ids_padded( spec_decode_common_attn_metadata, @@ -3105,7 +3122,9 @@ def propose_draft_token_ids( common_attn_metadata: CommonAttentionMetadata, ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if self.speculative_config.method == "ngram": + spec_config = self.speculative_config + assert spec_config is not None + if spec_config.method == "ngram": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, NgramProposer) draft_token_ids = self.drafter.propose( @@ -3115,11 +3134,11 @@ def propose_draft_token_ids( self.input_batch.token_ids_cpu, self.input_batch.spec_decode_unsupported_reqs, ) - elif self.speculative_config.method == "suffix": + elif spec_config.method == "suffix": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, SuffixDecodingProposer) draft_token_ids = self.drafter.propose(self.input_batch, sampled_token_ids) - elif self.speculative_config.method == "medusa": + elif spec_config.method == "medusa": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) @@ -3144,10 +3163,10 @@ def propose_draft_token_ids( target_hidden_states=hidden_states, sampling_metadata=sampling_metadata, ) - elif self.speculative_config.use_eagle(): + elif spec_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - if self.speculative_config.disable_padded_drafter_batch: + if spec_config.disable_padded_drafter_batch: # When padded-batch is disabled, the sampled_token_ids should be # the cpu-side list[list[int]] of valid sampled tokens for each # request, with invalid requests 
having empty lists. @@ -3197,7 +3216,7 @@ def propose_draft_token_ids( else: target_hidden_states = hidden_states[:num_scheduled_tokens] else: - if self.speculative_config.disable_padded_drafter_batch: + if spec_config.disable_padded_drafter_batch: token_indices_to_sample = None common_attn_metadata, token_indices = self.drafter.prepare_inputs( common_attn_metadata, @@ -3292,9 +3311,12 @@ def load_model(self, eep_scale_up: bool = False) -> None: and is_mixture_of_experts(self.drafter.model) and self.parallel_config.enable_eplb ): + spec_config = self.vllm_config.speculative_config + assert spec_config is not None + assert spec_config.draft_model_config is not None logger.info_once( "EPLB is enabled for drafter model %s.", - self.vllm_config.speculative_config.draft_model_config.model, + spec_config.draft_model_config.model, ) global_expert_load = ( @@ -3311,7 +3333,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.eplb_state = EplbState(self.parallel_config, self.device) self.eplb_state.add_model( self.drafter.model, - self.vllm_config.speculative_config.draft_model_config, + spec_config.draft_model_config, global_expert_load, old_global_expert_indices, rank_mapping, @@ -3346,9 +3368,11 @@ def load_model(self, eep_scale_up: bool = False) -> None: scope="local", ) prepare_communication_buffer_for_model(self.model) + mm_config = self.model_config.multimodal_config self.is_multimodal_pruning_enabled = ( supports_multimodal_pruning(self.get_model()) - and self.model_config.multimodal_config.is_multimodal_pruning_enabled() + and mm_config is not None + and mm_config.is_multimodal_pruning_enabled() ) if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb: @@ -3383,15 +3407,14 @@ def load_model(self, eep_scale_up: bool = False) -> None: # CudagraphWraper and CudagraphDispatcher of vllm. # wrap the model with full cudagraph wrapper if needed. - if ( - self.compilation_config.cudagraph_mode.has_full_cudagraphs() - and not self.parallel_config.enable_dbo - ): + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None + if cudagraph_mode.has_full_cudagraphs() and not self.parallel_config.enable_dbo: self.model = CUDAGraphWrapper( self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL ) elif self.parallel_config.enable_dbo: - if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + if cudagraph_mode.has_full_cudagraphs(): self.model = UBatchWrapper( self.model, self.vllm_config, CUDAGraphMode.FULL, self.device ) @@ -4071,7 +4094,8 @@ def _dummy_pooler_run( def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: + mm_config = self.model_config.multimodal_config + if mm_config is not None and mm_config.skip_mm_profiling: logger.info( "Skipping memory profiling for multimodal encoder and " "encoder cache." 
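The recurring idiom in this typing cleanup is to bind an optional attribute to a local name, rule out None explicitly (an assert where None would be a programming error, an if-guard where None is a legitimate state), and only then dereference it, so mypy can narrow the type. Below is a minimal, self-contained sketch of that idiom; the MultimodalConfig, ModelConfig, and Runner classes are hypothetical stand-ins, not vLLM's real types.

from dataclasses import dataclass


@dataclass
class MultimodalConfig:
    skip_mm_profiling: bool = False


@dataclass
class ModelConfig:
    # None for text-only models, mirroring an optional sub-config.
    multimodal_config: MultimodalConfig | None = None


class Runner:
    def __init__(self, model_config: ModelConfig) -> None:
        self.model_config = model_config

    def profile_run(self) -> None:
        # Bind the optional once; mypy tracks the narrowed local,
        # which it cannot do for a repeated attribute access.
        mm_config = self.model_config.multimodal_config
        if mm_config is not None and mm_config.skip_mm_profiling:
            print("skipping multimodal profiling")
        else:
            print("profiling multimodal encoder")


Runner(ModelConfig(MultimodalConfig(skip_mm_profiling=True))).profile_run()

The asserts and cast() calls elsewhere in this patch achieve the same narrowing in the cases where a None or abstract type cannot actually occur at runtime.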
@@ -4333,8 +4357,9 @@ class AttentionGroupKey(NamedTuple): def get_attn_backends_for_group( kv_cache_group_spec: KVCacheGroupSpec, ) -> tuple[dict[AttentionGroupKey, list[str]], set[type[AttentionBackend]]]: + layer_type = cast(type[Any], AttentionLayerBase) layers = get_layers_from_vllm_config( - self.vllm_config, AttentionLayerBase, kv_cache_group_spec.layer_names + self.vllm_config, layer_type, kv_cache_group_spec.layer_names ) attn_backends = {} attn_backend_layers = defaultdict(list) @@ -4349,7 +4374,7 @@ def get_attn_backends_for_group( if layer_name in self.kv_sharing_fast_prefill_eligible_layers: attn_backend = create_fast_prefill_custom_backend( "FastPrefill", - attn_backend, + attn_backend, # type: ignore[arg-type] ) full_cls_name = attn_backend.full_cls_name() @@ -4448,6 +4473,7 @@ def _check_and_update_cudagraph_mode( min_cg_backend_name = attn_backend.__name__ # Flexible resolve the cudagraph mode cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None # check cudagraph for mixed batch is supported if ( cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL @@ -4562,12 +4588,17 @@ def _check_and_update_cudagraph_mode( self.compilation_config.adjust_cudagraph_sizes_for_spec_decode( self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size ) - self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes + capture_sizes = self.compilation_config.cudagraph_capture_sizes + self.cudagraph_batch_sizes = ( + capture_sizes if capture_sizes is not None else [] + ) # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None self.cudagraph_dispatcher.initialize_cudagraph_keys( - self.compilation_config.cudagraph_mode, self.uniform_decode_query_len + cudagraph_mode, self.uniform_decode_query_len ) def calculate_reorder_batch_threshold(self) -> None: @@ -4579,7 +4610,7 @@ def calculate_reorder_batch_threshold(self) -> None: """ min_none_high = lambda a, b: a if b is None else b if a is None else min(a, b) - reorder_batch_thresholds = [ + reorder_batch_thresholds: list[int | None] = [ group.get_metadata_builder().reorder_batch_threshold for group in self._attn_group_iterator() ] @@ -4588,7 +4619,7 @@ def calculate_reorder_batch_threshold(self) -> None: if len(reorder_batch_thresholds) == 0: self.reorder_batch_threshold = None return - self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) + self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) # type: ignore[assignment] @staticmethod def select_common_block_size( @@ -5048,12 +5079,16 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) if self.dcp_world_size > 1: - layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + layers = get_layers_from_vllm_config(self.vllm_config, layer_type) for layer in layers.values(): - assert layer.impl.need_to_return_lse_for_decode, ( + layer_impl = getattr(layer, "impl", None) + if layer_impl is None: + continue + assert layer_impl.need_to_return_lse_for_decode, ( "DCP requires attention impls to return" " the softmax lse for decode, but the impl " - f"{layer.impl.__class__.__name__} " + f"{layer_impl.__class__.__name__} " "does not return the softmax lse for decode." 
) @@ -5094,7 +5129,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: if has_ec_transfer() and get_ec_transfer().is_producer: return {} kv_cache_spec: dict[str, KVCacheSpec] = {} - attn_layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + attn_layers = get_layers_from_vllm_config(self.vllm_config, layer_type) for layer_name, attn_module in attn_layers.items(): if isinstance(attn_module, Attention) and ( kv_tgt_layer := attn_module.kv_sharing_target_layer_name diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index 9de123263755..2ce2b6451256 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -121,18 +121,24 @@ def __init__( @staticmethod def _create_sm_control_context(vllm_config: VllmConfig): - comm_sms = envs.VLLM_DBO_COMM_SMS + comm_sms: int = envs.VLLM_DBO_COMM_SMS set_comm_sms = lambda sms: None if vllm_config.parallel_config.enable_expert_parallel: # Currently only DeepEP highthroughput supports SM control so this # only affects that case. - all2all_manager = get_ep_group().device_communicator.all2all_manager - - if all2all_manager.max_sms_used() is not None: - comm_sms = min(comm_sms, all2all_manager.max_sms_used()) - - if comm_sms > 0: + ep_group = get_ep_group() + device_communicator = ep_group.device_communicator + all2all_manager = None + if device_communicator is not None: + all2all_manager = device_communicator.all2all_manager + + if all2all_manager is not None: + max_sms_used = all2all_manager.max_sms_used() + if max_sms_used is not None: + comm_sms = min(comm_sms, max_sms_used) + + if comm_sms > 0 and all2all_manager is not None: set_comm_sms = lambda sms: all2all_manager.set_num_sms(sms) # TODO(lucas): support other kernels besides DeepGEMM diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 18cbc3826279..f1fd5be966c3 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,7 +6,7 @@ import os from contextlib import AbstractContextManager, nullcontext from types import NoneType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import torch import torch.distributed @@ -87,8 +87,10 @@ def __init__( # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - # Torch profiler. Enabled and configured through env vars: + # Torch/CUDA profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # VLLM_TORCH_CUDA_PROFILE=1 + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" self.profiler = TorchProfilerWrapper( @@ -146,17 +148,17 @@ def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager: assert allocator.get_current_usage() == 0, ( "Sleep mode can only be used for one instance per process." ) - context = allocator.use_memory_pool(tag=tag) + return allocator.use_memory_pool(tag=tag) else: - context = nullcontext() - return context + return nullcontext() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks def init_device(self): - if self.device_config.device.type == "cuda": + device = self.device_config.device + if isinstance(device, torch.device) and device.type == "cuda": # This env var set by Ray causes exceptions with graph building. 
os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) if ( @@ -375,23 +377,21 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: from vllm.device_allocator.cumem import CuMemAllocator allocator = CuMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") + with allocator.use_memory_pool(tag="kv_cache"): + self.model_runner.initialize_kv_cache(kv_cache_config) else: - context = nullcontext() - with context: self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: # warm up sizes that are not in cudagraph capture sizes, # but users still want to compile for better performance, # e.g. for the max-num-batched token size in chunked prefill. - warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + compile_sizes = self.vllm_config.compilation_config.compile_sizes + warmup_sizes = compile_sizes.copy() if compile_sizes is not None else [] if not self.model_config.enforce_eager: - warmup_sizes = [ - x - for x in warmup_sizes - if x not in self.vllm_config.compilation_config.cudagraph_capture_sizes - ] + capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes + if capture_sizes is not None: + warmup_sizes = [x for x in warmup_sizes if x not in capture_sizes] # We skip EPLB here since we don't want to record dummy metrics for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) @@ -532,12 +532,12 @@ def execute_model( ) } if forward_pass and not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group(), - all_gather_tensors=all_gather_tensors, - ) + tensor_dict = get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors, ) + assert tensor_dict is not None + intermediate_tensors = IntermediateTensors(tensor_dict) with self.annotate_profile(scheduler_output): output = self.model_runner.execute_model( @@ -605,7 +605,7 @@ def _eplb_before_scale_down(self, old_ep_size: int, new_ep_size: int) -> None: assert self.model_runner.eplb_state is not None self.model_runner.eplb_state.rearrange( execute_shuffle=True, - global_expert_load=None, + global_expert_loads=None, rank_mapping=rank_mapping, ) torch.cuda.synchronize() @@ -661,7 +661,7 @@ def _reconfigure_parallel_config( def _reconfigure_moe( self, old_ep_size: int, new_ep_size: int - ) -> torch.Tensor | None: + ) -> list[torch.Tensor] | None: """ Reconfigure MoE modules with provided reconfig_request @@ -728,26 +728,29 @@ def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int): num_local_physical_experts = num_local_experts assert self.model_runner.eplb_state is not None new_physical_experts = ( - self.model_runner.eplb_state.physical_to_logical_map.shape[1] + self.model_runner.eplb_state.physical_to_logical_map.shape[1] # type: ignore[attr-defined] ) parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - - self.model_runner.eplb_state.logical_replica_count.shape[1] + - self.model_runner.eplb_state.logical_replica_count.shape[1] # type: ignore[attr-defined] ) global_expert_loads = None else: - num_local_physical_experts = torch.tensor( + num_local_physical_experts_tensor = torch.tensor( [num_local_experts], dtype=torch.int32, device="cpu" ) torch.distributed.broadcast( - num_local_physical_experts, group=get_ep_group().cpu_group, group_src=0 + num_local_physical_experts_tensor, + 
group=get_ep_group().cpu_group, + group_src=0, ) - num_local_physical_experts = num_local_physical_experts.item() + num_local_physical_experts = int(num_local_physical_experts_tensor.item()) new_physical_experts = num_local_physical_experts * new_ep_size assert self.model_runner.eplb_state is not None - global_expert_loads = self.model_runner.eplb_state.rearrange( + global_expert_loads_any = self.model_runner.eplb_state.rearrange( execute_shuffle=False ) + global_expert_loads = cast(list[torch.Tensor], global_expert_loads_any) parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_loads[0].shape[1] ) @@ -849,8 +852,9 @@ def init_worker_distributed_environment( init_batch_invariance() set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) + init_method = distributed_init_method or "env://" init_distributed_environment( - parallel_config.world_size, rank, distributed_init_method, local_rank, backend + parallel_config.world_size, rank, init_method, local_rank, backend ) ensure_model_parallel_initialized( diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index e59361f21372..ff047d8d03f0 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -59,7 +59,7 @@ def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"): @staticmethod def ensure_kv_transfer_shutdown() -> None: # has_kv_transfer_group can be None during interpreter shutdown. - if has_kv_transfer_group and has_kv_transfer_group(): + if has_kv_transfer_group and has_kv_transfer_group(): # type: ignore[truthy-function] ensure_kv_transfer_shutdown() @staticmethod diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 923c31c187f3..450160d28649 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -572,7 +572,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: format. Layers that do not need KV cache are not included. 
""" - layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layers = get_layers_from_vllm_config( + self.vllm_config, + AttentionLayerBase, # type: ignore[type-abstract] + ) block_size = self.vllm_config.cache_config.block_size cache_dtype_str = self.vllm_config.cache_config.cache_dtype @@ -725,7 +728,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int) req_id = self.input_batch.req_ids[i] assert req_id is not None num_tokens = scheduler_output.num_scheduled_tokens[req_id] - if not use_max_model_len and num_tokens > self.most_model_len: + if ( + not use_max_model_len + and self.most_model_len is not None + and num_tokens > self.most_model_len + ): use_max_model_len = True num_scheduled_tokens_per_req.append(num_tokens) if use_max_model_len: @@ -737,6 +744,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int) else: end_index = num_reqs else: + assert self.num_reqs_most_model_len is not None if len(num_scheduled_tokens_per_req) > self.num_reqs_most_model_len: num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[ : self.num_reqs_most_model_len @@ -829,6 +837,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int) ].to(self.device) seq_lens = self.seq_lens_cpu[: self.num_reqs_max_model_len].to(self.device) else: + assert self.num_reqs_most_model_len is not None block_tables = self.block_table_cpu[ : self.num_reqs_most_model_len, : self.num_blocks_per_most_len_req ] @@ -931,6 +940,8 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): for mm_input_id in encoder_input_ids: mm_feature = req_state.mm_features[mm_input_id] + if mm_feature.data is None: + continue mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) @@ -1114,7 +1125,7 @@ def sample_tokens( ) -> ModelRunnerOutput: if self.scheduler_output is None: # Nothing to do (PP non-final rank case), output isn't used. - return None # noqa + return None # type: ignore[return-value] scheduler_output = self.scheduler_output mm_embed_inputs = self.mm_embed_inputs self.scheduler_output = None @@ -1696,7 +1707,8 @@ def profile_run( ) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: + mm_config = self.model_config.multimodal_config + if mm_config is not None and mm_config.skip_mm_profiling: logger.info( "Skipping memory profiling for multimodal encoder and " "encoder cache." @@ -2166,5 +2178,9 @@ def _tpu_reset_lora(self, index: int): if isinstance(module, BaseLayerWithLoRA): module._original_set_lora = module.set_lora module._original_reset_lora = module.reset_lora - module.set_lora = _tpu_set_lora.__get__(module, module.__class__) - module.reset_lora = _tpu_reset_lora.__get__(module, module.__class__) + module.set_lora = _tpu_set_lora.__get__( # type: ignore[method-assign] + module, module.__class__ + ) + module.reset_lora = _tpu_reset_lora.__get__( # type: ignore[method-assign] + module, module.__class__ + ) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index a716a9c3aa82..569b2aaa766e 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -141,8 +141,7 @@ def init_device(self): # Set random seed. 
set_random_seed(self.model_config.seed) - if self.model_config.seed is not None: - xm.set_rng_state(self.model_config.seed, self.device) + xm.set_rng_state(self.model_config.seed, self.device) # Increase the cache size limit, which is the maximum number of # dynamo graphs that can be compiled. @@ -332,7 +331,7 @@ def _init_tpu_worker_distributed_environment( world_size=parallel_config.world_size, rank=rank, local_rank=local_rank, - distributed_init_method=distributed_init_method, + distributed_init_method=distributed_init_method or "env://", backend=current_platform.dist_backend, ) ensure_model_parallel_initialized( diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 9e99ea964ee0..92e4ce3abdba 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -280,7 +280,7 @@ def bind_kv_cache( kv_caches: dict[str, torch.Tensor], forward_context: dict[str, "Attention"], runner_kv_caches: list[torch.Tensor], - num_attn_module: int | None = 1, + num_attn_module: int = 1, ) -> None: """ Bind the allocated KV cache to both ModelRunner and forward context so @@ -362,5 +362,7 @@ def is_residual_scattered_for_sp( or vllm_config.compilation_config.use_inductor_graph_partition ): return True - - return num_input_tokens in vllm_config.compilation_config.compile_sizes + compile_sizes = vllm_config.compilation_config.compile_sizes + if compile_sizes is None: + return False + return num_input_tokens in compile_sizes diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 16f321c08077..57e7037e946e 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -315,10 +315,12 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: kv_cache_config = kv_cache_configs[self.global_rank] + assert self.vllm_config is not None with set_current_vllm_config(self.vllm_config): self.worker.initialize_from_config(kv_cache_config) # type: ignore def init_device(self): + assert self.vllm_config is not None with set_current_vllm_config(self.vllm_config): # To make vLLM config available during device initialization self.worker.init_device() # type: ignore diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 26c6f8d06bdc..4d7864e90496 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os +from typing import Any import torch import torch.distributed @@ -37,6 +38,7 @@ def __init__( # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" @@ -148,7 +150,12 @@ def determine_available_memory(self) -> int: return int(available_kv_cache_memory) def init_device(self): - if self.device_config.device.type == "xpu" and current_platform.is_xpu(): + device = self.device_config.device + if ( + isinstance(device, torch.device) + and device.type == "xpu" + and current_platform.is_xpu() + ): self.device = torch.device(f"xpu:{self.local_rank}") current_platform.set_device(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) From 0e741c12e3dc45093b2ddab8a31310703aa27002 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 11:38:35 +0800 Subject: [PATCH 266/578] [Bugfix] Fix Plamo3 rope handling (#29092) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/plamo3.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 5bb07722a5fc..4aeb9d432dcc 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -62,7 +62,7 @@ class Plamo3Config(PretrainedConfig): # type: ignore # if `sliding_window` is list interleaved_sliding_window: list[int | None] sliding_window_pattern: int - rope_theta: int + rope_parameters: dict[str, Any] rope_local_theta: int # MLP intermediate_size: int @@ -153,13 +153,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No quant_config=quant_config, prefix=f"{prefix}.o_proj", ) + layer_idx = extract_layer_index(prefix) - full_attn = config.interleaved_sliding_window[layer_idx] is None + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" - self.rope_theta = config.rope_theta if full_attn else config.rope_local_theta - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) + # Initialize the rotary embedding. + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] + else: + # Transformers v4 rope config. + # Global attention. Use the values in config.json. + rope_parameters = config.rope_parameters + # Local attention. Override the values in config.json. 
+ if is_sliding: + rope_parameters = dict( + rope_type="default", rope_theta=config.rope_local_theta + ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -170,8 +181,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) set_weight_attrs( From a982f5b5ea4a1932424927ea357b532d0e45caf1 Mon Sep 17 00:00:00 2001 From: zhrrr <43847754+izhuhaoran@users.noreply.github.com> Date: Fri, 21 Nov 2025 11:39:09 +0800 Subject: [PATCH 267/578] [kernel][perf] support uncontiguous input for rms_norm kernel (#28103) Signed-off-by: zhuhaoran Signed-off-by: izhuhaoran Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- csrc/dispatch_utils.h | 21 ++++++++ csrc/layernorm_kernels.cu | 80 +++++++++++++++++++++---------- vllm/_custom_ops.py | 5 +- vllm/compilation/matcher_utils.py | 4 +- 4 files changed, 77 insertions(+), 33 deletions(-) diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 9ae0ed975edd..e1d131e4a785 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -117,3 +117,24 @@ break; \ } \ } + +#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \ + switch (NUM_DIMS) { \ + case 2: { \ + constexpr int tensor_rank = 2; \ + __VA_ARGS__(); \ + break; \ + } \ + case 3: { \ + constexpr int tensor_rank = 3; \ + __VA_ARGS__(); \ + break; \ + } \ + case 4: { \ + constexpr int tensor_rank = 4; \ + __VA_ARGS__(); \ + break; \ + } \ + default: \ + TORCH_CHECK(false, "Expects rank 2, 3 or 4 tensors but got ", NUM_DIMS); \ + } diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 48771e4b3aff..dfc67b933cca 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -10,16 +10,38 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. 
-template +template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const int64_t input_stride, + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride_d2, // input.stride(-2) + const int64_t input_stride_d3, // input.stride(-3) + const int64_t input_stride_d4, // input.stride(-4) + const int64_t input_shape_d2, // input.size(-2) + const int64_t input_shape_d3, // input.size(-3) const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; - const scalar_t* input_row = input + blockIdx.x * input_stride; + const scalar_t* input_row; + if constexpr (NUM_DIMS == 2) { + // 2D for layernorm normal case [batch_size, hidden] + input_row = input + blockIdx.x * input_stride_d2; + } else if constexpr (NUM_DIMS == 3) { + // 3D for q/k norm [batch_size, num_heads, head_size] + int batch_idx = blockIdx.x / input_shape_d2; + int head_idx = blockIdx.x % input_shape_d2; + input_row = + input + batch_idx * input_stride_d3 + head_idx * input_stride_d2; + } else if constexpr (NUM_DIMS == 4) { + // 4D for transformers model_impl qk norm [batch, seq, head, head_dim] + int batch_idx = blockIdx.x / (input_shape_d3 * input_shape_d2); + int remaining = blockIdx.x % (input_shape_d3 * input_shape_d2); + int seq_idx = remaining / input_shape_d2; + int head_idx = remaining % input_shape_d2; + input_row = input + batch_idx * input_stride_d4 + + seq_idx * input_stride_d3 + head_idx * input_stride_d2; + } auto vec_op = [&variance](const vec_n_t& vec) { #pragma unroll @@ -164,38 +186,44 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { TORCH_CHECK(out.is_contiguous()); + if (input.stride(-1) != 1) { + input = input.contiguous(); + } TORCH_CHECK(input.stride(-1) == 1); TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); - // We cannot just use `input.stride(-2)` if the tensor is not row-major. - // Instead, we use a 2d view to get the second-innermost stride. - // That way the dimensions (except the last one) can be arbitrarily permuted. - torch::Tensor input_view = input.view({-1, hidden_size}); - - int num_tokens = input_view.numel() / hidden_size; - int64_t input_stride = input_view.stride(-2); + int num_tokens = input.numel() / hidden_size; + int num_dims = input.dim(); + int64_t input_stride_d2 = input.stride(-2); + int64_t input_stride_d3 = (num_dims >= 3) ? input.stride(-3) : 0; + int64_t input_stride_d4 = (num_dims >= 4) ? input.stride(-4) : 0; + int64_t input_shape_d2 = (num_dims >= 3) ? input.size(-2) : 0; + int64_t input_shape_d3 = (num_dims >= 4) ? input.size(-3) : 0; // For large num_tokens, use smaller blocks to increase SM concurrency. const int max_block_size = (num_tokens < 256) ? 
1024 : 256; dim3 grid(num_tokens); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input_view.scalar_type(), "rms_norm_kernel", [&] { - const int calculated_vec_size = - std::gcd(16 / sizeof(scalar_t), hidden_size); - const int block_size = - std::min(hidden_size / calculated_vec_size, max_block_size); - dim3 block(block_size); - VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), input_view.data_ptr(), - input_stride, weight.data_ptr(), epsilon, num_tokens, - hidden_size); - }); + VLLM_DISPATCH_RANK234(num_dims, [&] { + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + const int calculated_vec_size = + std::gcd(16 / sizeof(scalar_t), hidden_size); + const int block_size = + std::min(hidden_size / calculated_vec_size, max_block_size); + dim3 block(block_size); + VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { + vllm::rms_norm_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + input_stride_d2, input_stride_d3, input_stride_d4, + input_shape_d2, input_shape_d3, weight.data_ptr(), + epsilon, num_tokens, hidden_size); }); + }); + }); } #define LAUNCH_FUSED_ADD_RMS_NORM(width) \ diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 66cf6472eee4..0f625a794524 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -328,10 +328,7 @@ def rotary_embedding( def rms_norm( out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float ) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input - # If removed, also need to remove contiguous in MatcherRMSNorm - input_contiguous = input.contiguous() - torch.ops._C.rms_norm(out, input_contiguous, weight, epsilon) + torch.ops._C.rms_norm(out, input, weight, epsilon) def fused_add_rms_norm( diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py index 38eb4e5301a1..e4cd063d2aee 100644 --- a/vllm/compilation/matcher_utils.py +++ b/vllm/compilation/matcher_utils.py @@ -162,12 +162,10 @@ def forward_custom( weight: torch.Tensor, ) -> torch.Tensor: result = torch.empty_like(input) - # TODO: support non-contiguous input for RMSNorm and remove this - input_contiguous = input.contiguous() _, result = auto_functionalized( RMS_OP, result=result, - input=input_contiguous, + input=input, weight=weight, epsilon=self.epsilon, ) From 0730414999343e722590ace615d5814c7e5b6827 Mon Sep 17 00:00:00 2001 From: jeremyteboul <80506730+jeremyteboul@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:39:47 -0800 Subject: [PATCH 268/578] [Core] Add audio_embeds support to chat completions (#29059) Signed-off-by: Jeremy Teboul Co-authored-by: Jeremy Teboul --- docs/features/multimodal_inputs.md | 32 ++++++ tests/entrypoints/test_chat_utils.py | 145 ++++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 149 ++++++++++++++++++++++++++- vllm/multimodal/audio.py | 24 +++++ vllm/multimodal/utils.py | 13 ++- 5 files changed, 360 insertions(+), 3 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 5f684604e603..4656ee43ea25 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -365,6 +365,8 @@ You must enable this feature via `enable_mm_embeds=True`. The vLLM engine may crash if incorrect shape of embeddings is passed. 
Only enable this flag for trusted users! +#### Image Embeddings + ??? code ```python @@ -441,6 +443,36 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd print(generated_text) ``` +#### Audio Embeddings + +You can pass pre-computed audio embeddings similar to image embeddings: + +??? code + + ```python + from vllm import LLM + import torch + + # Enable audio embeddings support + llm = LLM(model="fixie-ai/ultravox-v0_5-llama-3_2-1b", enable_mm_embeds=True) + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: